/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String names matching the MatCUSPARSEStorageFormat enum; used by the options parser below */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the symbolic/numeric factorization kernels implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations of the triangular solve and matrix-vector product kernels */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Forward declarations of destruction helpers for the GPU-side structures.
   NOTE: MatSeqAIJCUSPARSEMultStruct_Destroy is a C++ overload pair — one overload
   destroys a triangular-factor struct, the other the SpMV struct. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record the given CUDA stream in A's GPU structure and attach it to A's cusparse handle */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Adopt the given cusparse handle for A.  Any different handle previously stored is
   destroyed first; the pointer mode of the (new) handle is set to DEVICE. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle this matrix currently holds before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) A's cusparse handle — presumably because ownership lies
   with the caller; confirm against callers outside this file. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name (MATSOLVERCUSPARSE) for factored matrices of this type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat
A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create an (empty) factor matrix B of type SEQAIJCUSPARSE for the requested factor
   type and wire up the symbolic factorization entry points. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated by the factorization routines, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): record the requested
   storage format in A's GPU structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the type composed one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices that are not SEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Process runtime options (-mat_cusparse_*) controlling explicit-transpose generation
   and the GPU storage formats used for SpMV and triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these two options only apply to non-factored matrices */
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* this option is handled outside the check above, so it applies to factored matrices too */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the CPU (SeqAIJ) implementation, then install the
   CUSPARSE numeric factorization kernel. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegate to the CPU (SeqAIJ) implementation, then install the
   CUSPARSE numeric factorization kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

static
/* Symbolic ICC: delegate to the CPU (SeqAIJ) implementation, then install the
   CUSPARSE numeric factorization kernel. */
PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegate to the CPU (SeqAIJ) implementation, then install the
   CUSPARSE numeric factorization kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Assemble the unit lower triangular factor L of a (CPU-factored) ILU matrix A in
   host CSR arrays (pinned memory), explicitly inserting the unit diagonal entries,
   upload it to the GPU as 32-bit-index CSR, and run the cusparse solve analysis.
   Only done when the factor currently lives on the CPU. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate space for the lower triangular matrix (pinned host memory for fast upload) */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix; row 0 consists of the diagonal 1 only */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Assemble the upper triangular factor U of a (CPU-factored) ILU matrix A in host
   CSR arrays (pinned memory) using a->diag to locate U's rows, storing the inverted
   diagonal values, then upload to the GPU and run the cusparse solve analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate space for the upper triangular matrix (pinned host memory for fast upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking the rows backwards since adiag
         points run from the end of the array toward the front */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted: v[nz] is 1/d_i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Reset any existing GPU factors, build/upload both ILU triangular factors of A,
   and cache the row/column permutations on the device when they are not identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else
  if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build and upload both triangular factors from a (CPU) ICC factorization.
   Only the upper factor U is stored (accessed through the sbaij view of A->data);
   the lower solve reuses U's sparsity by solving with the TRANSPOSE operation on a
   separately scaled copy of the values.  Diagonal values are stored inverted. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;  /* sbaij view of the same data */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate space for the upper triangular matrix (pinned host memory for fast upload) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (v[nz] is the diagonal entry of row i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals of U; scale again by the diagonal for the L copy */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: same upper-CSR layout, but non-unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation: the lower solve reuses the upper CSR pattern transposed */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as U, but the AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Reset any existing GPU factors, build/upload the ICC triangular factors of A, and
   cache the permutation and its inverse on the device when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both factors share U's pattern: off-diagonals counted twice plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* upload the permutation (rows) and its inverse (columns) */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr =
ISGetIndices(iip,&irip);CHKERRQ(ierr); 705 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 706 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 707 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 708 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 709 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 710 ierr = ISDestroy(&iip);CHKERRQ(ierr); 711 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 712 } 713 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 714 PetscFunctionReturn(0); 715 } 716 717 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 718 { 719 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 720 IS isrow = b->row,iscol = b->col; 721 PetscBool row_identity,col_identity; 722 PetscErrorCode ierr; 723 724 PetscFunctionBegin; 725 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 726 B->offloadmask = PETSC_OFFLOAD_CPU; 727 /* determine which version of MatSolve needs to be used. */ 728 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 729 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 730 if (row_identity && col_identity) { 731 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 732 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 733 B->ops->matsolve = NULL; 734 B->ops->matsolvetranspose = NULL; 735 } else { 736 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 737 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 738 B->ops->matsolve = NULL; 739 B->ops->matsolvetranspose = NULL; 740 } 741 742 /* get the triangular factors */ 743 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 744 PetscFunctionReturn(0); 745 } 746 747 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 748 { 749 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 750 IS ip = b->row; 751 PetscBool perm_identity; 752 PetscErrorCode ierr; 753 754 PetscFunctionBegin; 755 ierr = 
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit CSC (i.e. transposed CSR) copies of
   both triangular factors and runs the cusparse solve analysis on them, so that the transpose
   solves can run as NON_TRANSPOSE operations on pre-transposed data.  Callers invoke this lazily,
   only when the transposed factors do not exist yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* NOTE(review): the two initializers below are dead values — both pointers are unconditionally
     overwritten with `new` before any use; callers guard against pre-existing transposed factors */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing flips the fill mode */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation: the data is already transposed, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds the explicit transpose (matTranspose) used by
   the transpose multiply routines.  For CSR storage this is a single csr2csc; for ELL/HYB storage
   the matrix is converted HYB->CSR, transposed with csr2csc, and converted back CSR->HYB.
   Does nothing when transpose generation is disabled or the transpose already exists.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars, matching the DEVICE pointer mode set below */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose swaps the dimensions but keeps the same number of entries */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (uncompressed) row offsets of A on the device; also reused later by
       MatSeqAIJCUSPARSECopyToGPU to refresh matTranspose values-only */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr =
PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b with permuted factors.  The transposed
   triangular factors are built lazily on first use; then: permute b with the row permutation,
   solve U^T, solve L^T, and apply the column permutation (via a temporary, since the final
   permutation cannot be done in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - as above but without any permutations:
   solve U^T into the work vector, then L^T into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b with permuted factors: permute b into the work vector,
   solve L, solve U, then reorder the result with the column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.
     NOTE(review): the end iterators here reuse the un-advanced element base (bGPU / tempGPU->begin()),
     unlike MatSolveTranspose above which uses bGPU+n; the range extent is governed by the index
     iterator (rpermIndices->end()), so both spellings traverse n elements — confirm intentional. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b without permutations:
   solve L into the work vector, then U into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host AIJ data on the GPU.
   Fast path: when the matrix was assembled before with an unchanged nonzero state (and CSR
   storage), only the numerical values are re-uploaded, and the cached transpose (if any) is
   refreshed with a csr2csc.  Otherwise the whole GPU structure is destroyed and rebuilt from
   the host arrays.  No-op when the matrix is bound to the CPU or the GPU copy is current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* values-only transpose refresh using the cached uncompressed row offsets */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR storage): tear down and rebuild the GPU copies */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* with compressed rows only the rows that contain nonzeros are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, matching the DEVICE pointer mode set below */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage a CSR copy on the device, convert to HYB, then discard the staging CSR */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* thrust functor over a zipped tuple: element 1 is updated with element 1 + element 0 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t)
= thrust::get<1>(t) + thrust::get<0>(t); 1427 } 1428 }; 1429 1430 struct VecCUDAEqualsReverse 1431 { 1432 template <typename Tuple> 1433 __host__ __device__ 1434 void operator()(Tuple t) 1435 { 1436 thrust::get<0>(t) = thrust::get<1>(t); 1437 } 1438 }; 1439 1440 typedef struct { 1441 PetscBool cisdense; 1442 PetscScalar *Bt; 1443 Mat X; 1444 } MatMatCusparse; 1445 1446 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1447 { 1448 PetscErrorCode ierr; 1449 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1450 cudaError_t cerr; 1451 1452 PetscFunctionBegin; 1453 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1454 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1455 ierr = PetscFree(data);CHKERRQ(ierr); 1456 PetscFunctionReturn(0); 1457 } 1458 1459 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1460 1461 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1462 { 1463 Mat_Product *product = C->product; 1464 Mat A,B; 1465 PetscInt m,n,k,blda,clda; 1466 PetscBool flg,biscuda; 1467 Mat_SeqAIJCUSPARSE *cusp; 1468 cusparseStatus_t stat; 1469 cusparseOperation_t opA; 1470 cusparseMatDescr_t matA; 1471 const PetscScalar *barray; 1472 PetscScalar *carray; 1473 PetscErrorCode ierr; 1474 MatMatCusparse *mmdata; 1475 Mat_SeqAIJCUSPARSEMultStruct *mat; 1476 CsrMatrix *csrmat; 1477 cudaError_t cuer; 1478 1479 PetscFunctionBegin; 1480 MatCheckProduct(C,1); 1481 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1482 mmdata = (MatMatCusparse*)product->data; 1483 A = product->A; 1484 B = product->B; 1485 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1486 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1487 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1488 Instead of silently accepting the wrong answer, I prefer to 
raise the error */ 1489 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1490 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1491 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1492 switch (product->type) { 1493 case MATPRODUCT_AB: 1494 case MATPRODUCT_PtAP: 1495 mat = cusp->mat; 1496 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1497 m = A->rmap->n; 1498 k = A->cmap->n; 1499 n = B->cmap->n; 1500 break; 1501 case MATPRODUCT_AtB: 1502 if (!cusp->transgen) { 1503 mat = cusp->mat; 1504 opA = CUSPARSE_OPERATION_TRANSPOSE; 1505 } else { 1506 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1507 mat = cusp->matTranspose; 1508 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1509 } 1510 m = A->cmap->n; 1511 k = A->rmap->n; 1512 n = B->cmap->n; 1513 break; 1514 case MATPRODUCT_ABt: 1515 case MATPRODUCT_RARt: 1516 mat = cusp->mat; 1517 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1518 m = A->rmap->n; 1519 k = B->cmap->n; 1520 n = B->rmap->n; 1521 break; 1522 default: 1523 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1524 } 1525 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1526 matA = mat->descr; 1527 csrmat = (CsrMatrix*)mat->mat; 1528 /* if the user passed a CPU matrix, copy the data to the GPU */ 1529 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1530 if (!biscuda) { 1531 ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1532 } 1533 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1534 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1535 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1536 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1537 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1538 } 
else { 1539 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1540 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1541 } 1542 1543 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1544 1545 /* cusparse spmm does not support transpose on B */ 1546 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1547 cublasHandle_t cublasv2handle; 1548 cublasStatus_t cerr; 1549 1550 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1551 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1552 B->cmap->n,B->rmap->n, 1553 &PETSC_CUSPARSE_ONE ,barray,blda, 1554 &PETSC_CUSPARSE_ZERO,barray,blda, 1555 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1556 blda = B->cmap->n; 1557 } 1558 1559 /* perform the MatMat operation */ 1560 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1561 csrmat->num_entries,mat->alpha,matA, 1562 csrmat->values->data().get(), 1563 csrmat->row_offsets->data().get(), 1564 csrmat->column_indices->data().get(), 1565 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1566 carray,clda);CHKERRCUSPARSE(stat); 1567 cuer = WaitForCUDA();CHKERRCUDA(cuer); 1568 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1569 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1570 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1571 if (product->type == MATPRODUCT_RARt) { 1572 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1573 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1574 } else if (product->type == MATPRODUCT_PtAP) { 1575 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1576 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1577 } else { 1578 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1579 } 1580 if (mmdata->cisdense) { 1581 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1582 } 1583 if (!biscuda) { 
1584 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1585 } 1586 PetscFunctionReturn(0); 1587 } 1588 1589 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1590 { 1591 Mat_Product *product = C->product; 1592 Mat A,B; 1593 PetscInt m,n; 1594 PetscBool cisdense,flg; 1595 PetscErrorCode ierr; 1596 MatMatCusparse *mmdata; 1597 Mat_SeqAIJCUSPARSE *cusp; 1598 1599 PetscFunctionBegin; 1600 MatCheckProduct(C,1); 1601 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1602 A = product->A; 1603 B = product->B; 1604 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1605 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1606 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1607 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1608 switch (product->type) { 1609 case MATPRODUCT_AB: 1610 m = A->rmap->n; 1611 n = B->cmap->n; 1612 break; 1613 case MATPRODUCT_AtB: 1614 m = A->cmap->n; 1615 n = B->cmap->n; 1616 break; 1617 case MATPRODUCT_ABt: 1618 m = A->rmap->n; 1619 n = B->rmap->n; 1620 break; 1621 case MATPRODUCT_PtAP: 1622 m = B->cmap->n; 1623 n = B->cmap->n; 1624 break; 1625 case MATPRODUCT_RARt: 1626 m = B->rmap->n; 1627 n = B->rmap->n; 1628 break; 1629 default: 1630 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1631 } 1632 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 1633 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 1634 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 1635 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 1636 1637 /* product data */ 1638 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 1639 mmdata->cisdense = cisdense; 1640 /* cusparse spmm 
does not support transpose on B */ 1641 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1642 cudaError_t cerr; 1643 1644 cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 1645 } 1646 /* for these products we need intermediate storage */ 1647 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1648 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 1649 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 1650 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 1651 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 1652 } else { 1653 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1654 } 1655 } 1656 C->product->data = mmdata; 1657 C->product->destroy = MatDestroy_MatMatCusparse; 1658 1659 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1660 PetscFunctionReturn(0); 1661 } 1662 1663 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 1664 1665 /* handles dense B */ 1666 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 1667 { 1668 Mat_Product *product = C->product; 1669 PetscErrorCode ierr; 1670 1671 PetscFunctionBegin; 1672 MatCheckProduct(C,1); 1673 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1674 if (product->A->boundtocpu) { 1675 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 1676 PetscFunctionReturn(0); 1677 } 1678 switch (product->type) { 1679 case MATPRODUCT_AB: 1680 case MATPRODUCT_AtB: 1681 case MATPRODUCT_ABt: 1682 case MATPRODUCT_PtAP: 1683 case MATPRODUCT_RARt: 1684 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 1685 default: 1686 break; 1687 } 1688 PetscFunctionReturn(0); 1689 } 1690 1691 static 
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A*xx (no transpose, no Hermitian) */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A*xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^H * xx */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^H * xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^T * xx */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Shared kernel for all MatMult variants: zz = op(A)*xx (+ yy when yy is non-NULL),
   where op is identity, transpose (trans) or conjugate transpose (trans && herm).
   Handles compressed-row (zero rows dropped) storage via a device work vector:
   the SpMV runs on the compact row set, and the result is gathered/scattered
   to/from the full-length vectors with thrust permutation iterators.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  /* herm without trans would mean plain conjugation, which this kernel does not implement */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* all-zero matrix: op(A)*xx == 0.  NOTE(review): when yy is given (MatMultAdd) and zz != yy,
       zz is left untouched here — presumably callers pass zz == yy in that case; confirm */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose stored (or Hermitian requested): let cusparse apply the operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose, generating it lazily on first use */
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx listed in cprowIndices into the compact work vector:
           workVector[i] = xarray[cprowIndices[i]] (VecCUDAEqualsReverse assigns tuple<1> into tuple<0>) */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      /* ELL/HYB storage path */
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        /* zarray[cprowIndices[i]] += workVector[i] (VecCUDAPlusEquals adds tuple<0> into tuple<1>) */
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already used beta_one when yy was given and yy == zz;
         otherwise add yy explicitly */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add, subtract one flop per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^T * xx + yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* only push to the GPU on a final assembly of an unfactored matrix not bound to the CPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.
   That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* const is cast away because the internal preallocation routine takes a non-const array */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroys the GPU-side data (mult structs or triangular factors), removes the composed
   functions, then falls through to the host SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* duplicate on the host, then convert the copy in place to the CUSPARSE type */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switches the Mat operations between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* bound to CPU: restore the host SeqAIJ kernels and drop the GPU product hooks */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* unbound: install the CUSPARSE kernels and product hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Converts a SeqAIJ matrix (or reuses/duplicates one) into a SeqAIJCUSPARSE matrix:
   allocates the GPU-side spptr data, switches the vector type to CUDA and installs the
   CUSPARSE operation table */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix must live on the GPU */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR; /* default storage format; changeable via MatCUSPARSESetFormat() */
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU kernels (PETSC_FALSE = not bound to CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  /* create as a host SeqAIJ and convert in place */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Registers the CUSPARSE solver type for LU/Cholesky/ILU/ICC factorizations of SeqAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees all GPU-side data of an unfactored SeqAIJCUSPARSE matrix, including the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if (handle = (*cusparsestruct)->handle) { /* intentional assignment-in-condition */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix (thrust device arrays plus the struct itself) and NULLs the pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for triangular-factor structs: frees descriptor, analysis info and CSR data */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Overload for SpMV structs: frees the stored matrix (HYB or CSR, per format), descriptor,
   compressed-row index array and the device-resident alpha/beta scalars */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat); /* NOTE(review): return value ignored, unlike the other call sites — confirm intentional */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees the factor data (both factors, their transposes, permutations and work vector)
   but keeps the struct and cusparse handle so the factors can be rebuilt */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector = 0;
  }
  PetscFunctionReturn(0);
}

/* Full destructor: Reset plus handle destruction and freeing of the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) { /* intentional assignment-in-condition */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}