/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Names of the storage formats, laid out as required by the PetscEnum option
   machinery: the values, the enum type name, the option prefix, and a
   terminating null. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, multiply, and destroy
   routines implemented in this file.  Note: the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads with
   different argument types. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU
   data and tells the CUSPARSE handle to issue its work on that stream.

   Note: assumes A->spptr is a valid Mat_SeqAIJCUSPARSE with a live handle;
   no NULL checks are performed here.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - Installs an externally created CUSPARSE handle on the
   matrix.  If the matrix already holds a different handle, that handle is
   destroyed first.  The pointer mode is (re)set to device pointers, so scalar
   arguments to later CUSPARSE calls through this handle must live on the GPU.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently own before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the matrix's CUSPARSE
   handle.  Used when the handle is owned elsewhere, e.g. after
   MatCUSPARSESetHandle() handed in a caller-owned handle.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factored matrices: reports that the solver
   package is MATSOLVERCUSPARSE.  The Mat argument is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B
   for a SEQAIJCUSPARSE matrix A and wires in the symbolic factorization
   routines appropriate to the requested factor type.  Numeric routines are
   installed later by the symbolic phase.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor's nonzero structure is allocated by the symbolic phase, so
     skip preallocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): stores the
   requested storage format.  For SEQAIJCUSPARSE both supported operations
   update the same single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed; silently
     does nothing for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* no-op for matrices that are not SEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
  }
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options: explicit-transpose
   generation and the GPU storage formats for SpMV and for SpMV+TriSolve. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* these options only make sense for an unfactored matrix */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric phase so the factors end up on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE Cholesky numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then install
   the CUSPARSE numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds the GPU copy of the unit
   lower triangular factor L from the host ILU factorization stored in A.

   The SeqAIJ factor stores L without its unit diagonal, so the 1's are
   inserted explicitly while packing CSR arrays in pinned host memory; the
   arrays are then mirrored to device (thrust) storage and handed to the
   CUSPARSE triangular-solve analysis.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[n]-ai[1] counts the stored (strictly lower) entries of rows 1..n-1;
         the extra n adds one unit-diagonal entry per row (row 0 has only its diagonal). */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the later thrust assign()s copy at full bandwidth */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;  /* row 0 is just the unit diagonal */
      v  = aa;
      vi = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device storage */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds the GPU copy of the upper
   triangular factor U from the host ILU factorization stored in A.

   The SeqAIJ factor stores U rows via a->diag in reverse layout with inverted
   diagonal entries; the rows are repacked back-to-front into a conventional
   CSR in pinned host memory, mirrored to the device, and analyzed by CUSPARSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases as i increases in this storage, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] holds 1/U(i,i) in the SeqAIJ factor, so invert it back */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device storage */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Rebuilds both ILU triangular
   factors on the GPU and uploads the row/column permutation indices needed
   by the permuted MatSolve paths (only when the orderings are not identity).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data actually transferred */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both triangular factors on
   the GPU from the host ICC factorization.

   Only the upper factor U is stored on the host (upper-triangular rows with
   inverted diagonal).  The upper factor is uploaded as-is (unit diagonal,
   entries negated), and the "lower" factor is represented by the same upper
   pattern with rescaled values, solved with CUSPARSE_OPERATION_TRANSPOSE, so
   no explicit transpose has to be formed here.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ here; presumably the
     ICC factor shares the sbaij layout — confirm against the factor routines. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal; both factors get its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals for U; additionally scale by 1/diag for the
             transposed (lower) solve values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor reuses the UPPER
         pattern and is solved with the TRANSPOSE operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Rebuilds the ICC triangular
   factors on the GPU and uploads the row permutation and its inverse when the
   ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any stale GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonals appear in both factors, the diagonal only once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* symmetric permutation: upload both the permutation and its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU on the CPU (SeqAIJ), then
   selects the natural-ordering or permuted GPU solve kernels and mirrors the
   triangular factors onto the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky on the CPU
   (SeqAIJ), then selects the natural-ordering or permuted GPU solve kernels
   and mirrors the triangular factors onto the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds explicit transposes
   (via csr2csc) of both triangular factors and runs the CUSPARSE solve
   analysis on them, enabling MatSolveTranspose on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the
matrix descriptors of the lower triangular factor */ 789 matrixType = cusparseGetMatType(loTriFactor->descr); 790 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 791 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 792 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 793 diagType = cusparseGetMatDiagType(loTriFactor->descr); 794 795 /* Create the matrix description */ 796 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 797 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 798 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 799 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 800 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 801 802 /* Create the solve analysis information */ 803 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 804 805 /* set the operation */ 806 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 807 808 /* allocate GPU space for the CSC of the lower triangular factor*/ 809 loTriFactorT->csrMat = new CsrMatrix; 810 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 811 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 812 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 813 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 814 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 815 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 816 817 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 818 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 819 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 820 loTriFactor->csrMat->values->data().get(), 821 loTriFactor->csrMat->row_offsets->data().get(), 822 loTriFactor->csrMat->column_indices->data().get(), 823 loTriFactorT->csrMat->values->data().get(), 824 loTriFactorT->csrMat->column_indices->data().get(), 825 loTriFactorT->csrMat->row_offsets->data().get(), 826 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 827 828 /* perform the solve analysis on the transposed matrix */ 829 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 830 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 831 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 832 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 833 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 834 835 /* assign the pointer. Is this really necessary? */ 836 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 837 838 /*********************************************/ 839 /* Now the Transpose of the Upper Tri Factor */ 840 /*********************************************/ 841 842 /* allocate space for the transpose of the upper triangular factor */ 843 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 844 845 /* set the matrix descriptors of the upper triangular factor */ 846 matrixType = cusparseGetMatType(upTriFactor->descr); 847 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 848 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
849 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 850 diagType = cusparseGetMatDiagType(upTriFactor->descr); 851 852 /* Create the matrix description */ 853 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 854 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 855 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 856 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 857 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 858 859 /* Create the solve analysis information */ 860 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 861 862 /* set the operation */ 863 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 864 865 /* allocate GPU space for the CSC of the upper triangular factor*/ 866 upTriFactorT->csrMat = new CsrMatrix; 867 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 868 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 869 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 870 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 871 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 872 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 873 874 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 875 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 876 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 877 upTriFactor->csrMat->values->data().get(), 878 upTriFactor->csrMat->row_offsets->data().get(), 879 upTriFactor->csrMat->column_indices->data().get(), 880 upTriFactorT->csrMat->values->data().get(), 881 upTriFactorT->csrMat->column_indices->data().get(), 882 upTriFactorT->csrMat->row_offsets->data().get(), 883 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 884 885 /* perform the solve analysis on the transposed matrix */ 886 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 887 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 888 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 889 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 890 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 891 892 /* assign the pointer. Is this really necessary? 
*/ 893 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 894 PetscFunctionReturn(0); 895 } 896 897 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 898 { 899 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 900 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 901 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 902 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 903 cusparseStatus_t stat; 904 cusparseIndexBase_t indexBase; 905 cudaError_t err; 906 PetscErrorCode ierr; 907 908 PetscFunctionBegin; 909 if (!cusparsestruct->transgen) PetscFunctionReturn(0); 910 /* allocate space for the triangular factor information */ 911 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 912 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 913 indexBase = cusparseGetMatIndexBase(matstruct->descr); 914 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 915 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 916 917 /* set alpha and beta */ 918 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 919 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 920 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 921 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 922 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 923 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 924 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 925 926 if 
(cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose swaps the row/column dimensions; nonzero count is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (uncompressed) row offsets uploaded from a->i; needed because matstruct->mat
       may hold compressed rows. NOTE(review): assigned without freeing a previous
       allocation — presumably NULL on this path; confirm against MatSeqAIJCUSPARSECopyToGPU,
       which deletes rowoffsets_gpu when rebuilding. */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Transposed triangular solve with row/column permutations:
   x = P_c^T U^-T L^-T P_r b, using the explicitly transposed factors
   (built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve on first use).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build and analyze the transposed factors on first transposed solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Transposed triangular solve, natural ordering: x = U^-T L^-T b via the explicitly
   transposed factors; no permutation gather/scatter is needed.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve with row/column permutations: x = P_c U^-1 L^-1 P_r b,
   using the (non-transposed) factors uploaded at numeric-factorization time.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve, natural ordering: x = U^-1 L^-1 b. Uses the work vector in
   cusparseTriFactors->workVector as the intermediate between the two solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Copies (or updates) the matrix on the GPU in the requested storage format.
   No-op if the matrix is bound to the CPU or already up to date on the GPU.
   When the nonzero pattern is unchanged (same nonzerostate, CSR format) only the
   values are re-uploaded (and the cached transpose refreshed); otherwise the GPU
   structures are rebuilt from scratch.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat =
cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                        A->cmap->n, mat->num_entries,
                        mat->values->data().get(),
                        cusparsestruct->rowoffsets_gpu->data().get(),
                        mat->column_indices->data().get(),
                        matT->values->data().get(),
                        matT->column_indices->data().get(),
                        matT->row_offsets->data().get(),
                        CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern changed (or first assembly): rebuild the GPU structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* only the nonempty rows are stored; ridx maps them back to global rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars (1, 0, 1) used with POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* upload CSR first, convert to HYB/ELL on the device, then free the CSR copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* thrust functor: accumulate element 0 of the tuple into element 1 (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* thrust functor: assign element 1 of the tuple to element 0 (reverse-direction copy) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* product data attached to C->product->data for AIJCUSPARSE x DENSECUDA products */
typedef struct {
  PetscBool   cisdense; /* whether C must be converted back to MATSEQDENSE at the end */
  PetscScalar *Bt;      /* device buffer holding B^T (spmm cannot transpose B) */
  Mat         X;        /* intermediate dense result for PtAP/RARt */
} MatMatCusparse;

/* destructor for the MatMatCusparse product data above */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
   Numeric phase of C = op(A)*op(B) (AB, AtB, ABt, PtAP, RARt) with A SEQAIJCUSPARSE
   and B dense, computed on the GPU with cusparse spmm (plus cublas geam / dense matmat
   for the transposed-B and two-sided cases).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr =
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1467 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1468 switch (product->type) { 1469 case MATPRODUCT_AB: 1470 case MATPRODUCT_PtAP: 1471 mat = cusp->mat; 1472 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1473 m = A->rmap->n; 1474 k = A->cmap->n; 1475 n = B->cmap->n; 1476 break; 1477 case MATPRODUCT_AtB: 1478 if (!cusp->transgen) { 1479 mat = cusp->mat; 1480 opA = CUSPARSE_OPERATION_TRANSPOSE; 1481 } else { 1482 if (!cusp->matTranspose) { 1483 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1484 } 1485 mat = cusp->matTranspose; 1486 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1487 } 1488 m = A->cmap->n; 1489 k = A->rmap->n; 1490 n = B->cmap->n; 1491 break; 1492 case MATPRODUCT_ABt: 1493 case MATPRODUCT_RARt: 1494 mat = cusp->mat; 1495 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1496 m = A->rmap->n; 1497 k = B->cmap->n; 1498 n = B->rmap->n; 1499 break; 1500 default: 1501 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1502 } 1503 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1504 matA = mat->descr; 1505 csrmat = (CsrMatrix*)mat->mat; 1506 /* if the user passed a CPU matrix, copy the data to the GPU */ 1507 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1508 if (!biscuda) { 1509 ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1510 } 1511 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1512 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1513 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1514 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1515 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1516 } else { 1517 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1518 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1519 } 1520 1521 ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 1522 1523 /* cusparse spmm does not support transpose on B */ 1524 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1525 cublasHandle_t cublasv2handle; 1526 cublasStatus_t cerr; 1527 1528 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1529 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1530 B->cmap->n,B->rmap->n, 1531 &PETSC_CUSPARSE_ONE ,barray,blda, 1532 &PETSC_CUSPARSE_ZERO,barray,blda, 1533 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1534 blda = B->cmap->n; 1535 } 1536 1537 /* perform the MatMat operation */ 1538 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1539 csrmat->num_entries,mat->alpha,matA, 1540 csrmat->values->data().get(), 1541 csrmat->row_offsets->data().get(), 1542 csrmat->column_indices->data().get(), 1543 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1544 carray,clda);CHKERRCUSPARSE(stat); 1545 cuer = WaitForGPU();CHKERRCUDA(cuer); 1546 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1547 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1548 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1549 if (product->type == MATPRODUCT_RARt) { 1550 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1551 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1552 } else if (product->type == MATPRODUCT_PtAP) { 1553 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1554 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1555 } else { 1556 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1557 } 1558 if (mmdata->cisdense) { 1559 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1560 } 1561 if (!biscuda) { 1562 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1563 } 1564 PetscFunctionReturn(0); 1565 } 1566 1567 static 
PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B: preallocate a device buffer
     for an explicit B^t, filled at each numeric call */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* A lives on the CPU: fall back to the plain SeqAIJ x SeqDense path */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    /* intentional fall-through to the empty default */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Shared kernel implementing zz = op(A) xx + yy, where op is the identity,
   the transpose (trans) or the conjugate transpose (trans && herm).
   yy may be NULL (plain MatMult) or alias zz (in-place add).
   Handles matrices stored with compressed rows (zero rows dropped) by staging
   the SpMV input/output in cusparsestruct->workVector and scattering through
   matstruct->cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* op(A) is identically zero, so zz = 0*xx + (yy ? yy : 0).
       The original code returned without touching zz when yy was given and
       distinct from zz, leaving MatMultAdd with a stale result; copy yy in. */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (yy != zz) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose stored (or conjugate transpose wanted): let cusparse transpose */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x: SpMV output may need to go through the compressed work vector */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      /* z = A^t x or A^H x: SpMV input may need to come from the compressed work vector */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) { /* gather the entries of x corresponding to nonzero rows into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* Finish assembly on the CPU side, then (for final assembly of an unfactored,
   GPU-enabled matrix) push the data to the GPU so subsequent MatMult's do not
   pay the transfer cost. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Destroy the GPU-side data (plain mult structs or triangular factors, depending
   on whether the matrix is factored), remove the composed functions, then hand
   off to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate via the SeqAIJ copy, then convert the copy in place to seqaijcusparse. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switch the mult-related operations (and the MatProduct hooks) between the CPU
   SeqAIJ implementations (flg == PETSC_TRUE) and the CUSPARSE ones. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* bound to CPU: plain SeqAIJ kernels; no Hermitian variants on the CPU path */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to seqaijcusparse: set the CUDA default vec type,
   allocate the GPU-side spptr (mult structs or trifactors), and install the
   CUSPARSE method table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse !=
MAT_REUSE_MATRIX && !B->spptr) {
    /* allocate the GPU-side data: plain mult structs for an unfactored matrix,
       triangular-factor storage otherwise */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU mult kernels and the MatProduct hooks */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: build a SeqAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the cusparse solver type for LU/Cholesky/ILU/ICC factorizations of
   seqaijcusparse matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free all GPU-side data of an unfactored matrix: both mult structs, the
   compressed-row work vector, the row-offsets copy, and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr =
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) { /* extra parens: assignment in condition is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free the thrust arrays backing a CsrMatrix and the struct itself; *mat is set
   to NULL so callers can safely destroy twice. delete on NULL is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor struct: matrix descriptor, solve analysis info,
   and the CSR storage of the factor. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one mult struct: the stored matrix (HYB handle or CsrMatrix depending on
   format), the descriptor, the compressed-row index array, and the device-side
   alpha/beta constants. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* the return value used to be silently discarded here; check it like every other call site */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Release the factor data (both factors, their transposes, permutations, work
   vector) but keep the struct and its cusparse handle so the factors can be
   rebuilt, e.g., on a new nonzero pattern. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector   = 0;
  }
  PetscFunctionReturn(0);
}

/* Full teardown: reset the factor data, destroy the cusparse handle, free the struct. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if ((handle = (*trifactors)->handle)) { /* extra parens: assignment in condition is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}