1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 22 23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 26 27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 36 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 37 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 38 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 39 40 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 41 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 42 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 43 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 44 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 45 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 46 47 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 48 { 49 cusparseStatus_t stat; 50 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 51 52 PetscFunctionBegin; 53 cusparsestruct->stream = stream; 54 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 55 PetscFunctionReturn(0); 56 } 57 58 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 59 { 60 cusparseStatus_t stat; 61 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 62 63 PetscFunctionBegin; 64 if (cusparsestruct->handle != handle) { 65 if (cusparsestruct->handle) { 66 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 67 } 68 cusparsestruct->handle = handle; 69 } 70 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 71 PetscFunctionReturn(0); 72 } 73 74 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 75 { 76 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 77 78 PetscFunctionBegin; 79 if (cusparsestruct->handle) cusparsestruct->handle = 0; 80 PetscFunctionReturn(0); 81 } 82 83 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 84 { 85 PetscFunctionBegin; 86 *type = MATSOLVERCUSPARSE; 87 PetscFunctionReturn(0); 88 } 89 90 /*MC 91 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 92 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 93 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 94 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 95 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 96 algorithms are not recommended. This class does NOT support direct solver operations. 97 98 Level: beginner 99 100 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 101 M*/ 102 103 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 104 { 105 PetscErrorCode ierr; 106 PetscInt n = A->rmap->n; 107 108 PetscFunctionBegin; 109 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110 (*B)->factortype = ftype; 111 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 112 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 113 114 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 115 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 116 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 117 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 121 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122 123 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 124 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 125 PetscFunctionReturn(0); 126 } 127 128 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 129 { 130 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 131 132 PetscFunctionBegin; 133 switch (op) { 134 case MAT_CUSPARSE_MULT: 135 cusparsestruct->format = format; 136 break; 137 case MAT_CUSPARSE_ALL: 138 cusparsestruct->format = format; 139 break; 140 default: 141 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 142 } 143 PetscFunctionReturn(0); 144 } 145 146 /*@ 147 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 148 operation. Only the MatMult operation can use different GPU storage formats 149 for MPIAIJCUSPARSE matrices. 150 Not Collective 151 152 Input Parameters: 153 + A - Matrix of type SEQAIJCUSPARSE 154 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 155 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 156 157 Output Parameter: 158 159 Level: intermediate 160 161 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 162 @*/ 163 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 164 { 165 PetscErrorCode ierr; 166 167 PetscFunctionBegin; 168 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 /*@ 174 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 175 176 Collective on mat 177 178 Input Parameters: 179 + A - Matrix of type SEQAIJCUSPARSE 180 - transgen - the boolean flag 181 182 Level: intermediate 183 184 .seealso: MATSEQAIJCUSPARSE 185 @*/ 186 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 187 { 188 PetscErrorCode ierr; 189 PetscBool flg; 190 191 PetscFunctionBegin; 192 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 193 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 194 if (flg) { 195 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 196 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 197 cusp->transgen = transgen; 198 } 199 PetscFunctionReturn(0); 200 } 201 202 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 203 { 204 PetscErrorCode ierr; 205 MatCUSPARSEStorageFormat format; 206 PetscBool flg; 207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 208 209 PetscFunctionBegin; 210 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 211 if (A->factortype == MAT_FACTOR_NONE) { 212 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr); 213 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 214 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 215 if (flg) { 216 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 217 } 218 } 219 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 220 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 221 if (flg) { 222 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 223 } 224 ierr = PetscOptionsTail();CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 229 { 230 PetscErrorCode ierr; 231 232 PetscFunctionBegin; 233 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 234 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 239 { 240 PetscErrorCode ierr; 241 242 PetscFunctionBegin; 243 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 244 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 245 PetscFunctionReturn(0); 246 } 247 248 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 249 { 250 PetscErrorCode ierr; 251 252 PetscFunctionBegin; 253 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 254 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 255 PetscFunctionReturn(0); 256 } 257 258 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 259 { 260 PetscErrorCode ierr; 261 262 PetscFunctionBegin; 263 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 264 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 265 PetscFunctionReturn(0); 266 } 267 268 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 269 { 270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271 PetscInt n = A->rmap->n; 272 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 273 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 274 cusparseStatus_t stat; 275 const PetscInt *ai = a->i,*aj = a->j,*vi; 276 const MatScalar *aa = a->a,*v; 277 PetscInt *AiLo, *AjLo; 278 PetscScalar *AALo; 279 PetscInt i,nz, nzLower, offset, rowOffset; 280 PetscErrorCode ierr; 281 cudaError_t cerr; 282 283 PetscFunctionBegin; 284 if (!n) PetscFunctionReturn(0); 285 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 286 try { 287 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 288 nzLower=n+ai[n]-ai[1]; 289 290 /* Allocate Space for the lower triangular matrix */ 291 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 292 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 293 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 294 295 /* Fill the lower triangular matrix */ 296 AiLo[0] = (PetscInt) 0; 297 AiLo[n] = nzLower; 298 AjLo[0] = (PetscInt) 0; 299 AALo[0] = (MatScalar) 1.0; 300 v = aa; 301 vi = aj; 302 offset = 1; 303 rowOffset= 1; 304 for (i=1; i<n; i++) { 305 nz = ai[i+1] - ai[i]; 306 /* additional 1 for the term on the diagonal */ 307 AiLo[i] = rowOffset; 308 rowOffset += nz+1; 309 310 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 311 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 312 313 offset += nz; 314 AjLo[offset] = (PetscInt) i; 315 AALo[offset] = (MatScalar) 1.0; 316 offset += 1; 317 318 v += nz; 319 vi += nz; 320 } 321 322 /* allocate space for the triangular factor information */ 323 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 324 325 /* Create the matrix description */ 326 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 327 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 328 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 329 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 330 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 331 332 /* Create the solve analysis information */ 333 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 334 335 /* set the operation */ 336 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 337 338 /* set the matrix */ 339 loTriFactor->csrMat = new CsrMatrix; 340 loTriFactor->csrMat->num_rows = n; 341 loTriFactor->csrMat->num_cols = n; 342 loTriFactor->csrMat->num_entries = nzLower; 343 344 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 345 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 346 347 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 348 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 349 350 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 351 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 352 353 /* perform the solve analysis */ 354 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 355 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 357 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 358 359 /* assign the pointer. Is this really necessary? */ 360 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 361 362 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 363 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 364 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 365 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 366 } catch(char *ex) { 367 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 368 } 369 } 370 PetscFunctionReturn(0); 371 } 372 373 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 374 { 375 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 376 PetscInt n = A->rmap->n; 377 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 378 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 379 cusparseStatus_t stat; 380 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 381 const MatScalar *aa = a->a,*v; 382 PetscInt *AiUp, *AjUp; 383 PetscScalar *AAUp; 384 PetscInt i,nz, nzUpper, offset; 385 PetscErrorCode ierr; 386 cudaError_t cerr; 387 388 PetscFunctionBegin; 389 if (!n) PetscFunctionReturn(0); 390 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 391 try { 392 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 393 nzUpper = adiag[0]-adiag[n]; 394 395 /* Allocate Space for the upper triangular matrix */ 396 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 397 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 398 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 399 400 /* Fill the upper triangular matrix */ 401 AiUp[0]=(PetscInt) 0; 402 AiUp[n]=nzUpper; 403 offset = nzUpper; 404 for (i=n-1; i>=0; i--) { 405 v = aa + adiag[i+1] + 1; 406 vi = aj + adiag[i+1] + 1; 407 408 /* number of elements NOT on the diagonal */ 409 nz = adiag[i] - adiag[i+1]-1; 410 411 /* decrement the offset */ 412 offset -= (nz+1); 413 414 /* first, set the diagonal elements */ 415 AjUp[offset] = (PetscInt) i; 416 AAUp[offset] = (MatScalar)1./v[nz]; 417 AiUp[i] = AiUp[i+1] - (nz+1); 418 419 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 420 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 421 } 422 423 /* allocate space for the triangular factor information */ 424 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 425 426 /* Create the matrix description */ 427 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 428 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 429 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 430 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 431 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 432 433 /* Create the solve analysis information */ 434 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 435 436 /* set the operation */ 437 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 438 439 /* set the matrix */ 440 upTriFactor->csrMat = new CsrMatrix; 441 upTriFactor->csrMat->num_rows = n; 442 upTriFactor->csrMat->num_cols = n; 443 upTriFactor->csrMat->num_entries = nzUpper; 444 445 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 446 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 447 448 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 449 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 450 451 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 452 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 453 454 /* perform the solve analysis */ 455 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 456 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 457 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 458 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 459 460 /* assign the pointer. Is this really necessary? */ 461 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 462 463 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 464 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 465 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 466 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 467 } catch(char *ex) { 468 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 469 } 470 } 471 PetscFunctionReturn(0); 472 } 473 474 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 475 { 476 PetscErrorCode ierr; 477 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 478 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 479 IS isrow = a->row,iscol = a->icol; 480 PetscBool row_identity,col_identity; 481 const PetscInt *r,*c; 482 PetscInt n = A->rmap->n; 483 484 PetscFunctionBegin; 485 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 486 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 487 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 488 489 cusparseTriFactors->workVector = new THRUSTARRAY(n); 490 cusparseTriFactors->nnz=a->nz; 491 492 A->offloadmask = PETSC_OFFLOAD_BOTH; 493 /* lower triangular indices */ 494 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 495 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 496 if (!row_identity) { 497 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 498 cusparseTriFactors->rpermIndices->assign(r, r+n); 499 } 500 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 501 502 /* upper triangular indices */ 503 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 504 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 505 if (!col_identity) { 506 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 507 cusparseTriFactors->cpermIndices->assign(c, c+n); 508 } 509 510 if (!row_identity && !col_identity) { 511 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 512 } else if(!row_identity) { 513 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 514 } else if(!col_identity) { 515 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 516 } 517 518 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 519 PetscFunctionReturn(0); 520 } 521 522 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 523 { 524 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 525 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 526 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 527 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 528 cusparseStatus_t stat; 529 PetscErrorCode ierr; 530 cudaError_t cerr; 531 PetscInt *AiUp, *AjUp; 532 PetscScalar *AAUp; 533 PetscScalar *AALo; 534 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 535 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 536 const PetscInt *ai = b->i,*aj = b->j,*vj; 537 const MatScalar *aa = b->a,*v; 538 539 PetscFunctionBegin; 540 if (!n) PetscFunctionReturn(0); 541 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 542 try { 543 /* Allocate Space for the upper triangular matrix */ 544 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 545 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 546 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 547 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 548 549 /* Fill the upper triangular matrix */ 550 AiUp[0]=(PetscInt) 0; 551 AiUp[n]=nzUpper; 552 offset = 0; 553 for (i=0; i<n; i++) { 554 /* set the pointers */ 555 v = aa + ai[i]; 556 vj = aj + ai[i]; 557 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 558 559 /* first, set the diagonal elements */ 560 AjUp[offset] = (PetscInt) i; 561 AAUp[offset] = (MatScalar)1.0/v[nz]; 562 AiUp[i] = offset; 563 AALo[offset] = (MatScalar)1.0/v[nz]; 564 565 offset+=1; 566 if (nz>0) { 567 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 568 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 569 for (j=offset; j<offset+nz; j++) { 570 AAUp[j] = -AAUp[j]; 571 AALo[j] = AAUp[j]/v[nz]; 572 } 573 offset+=nz; 574 } 575 } 576 577 /* allocate space for the triangular factor information */ 578 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 579 580 /* Create the matrix description */ 581 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 582 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 583 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 584 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 585 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 586 587 /* Create the solve analysis information */ 588 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 589 590 /* set the operation */ 591 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 592 593 /* set the matrix */ 594 upTriFactor->csrMat = new CsrMatrix; 595 upTriFactor->csrMat->num_rows = A->rmap->n; 596 upTriFactor->csrMat->num_cols = A->cmap->n; 597 upTriFactor->csrMat->num_entries = a->nz; 598 599 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 600 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 601 602 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 603 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 604 605 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 606 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 607 608 /* perform the solve analysis */ 609 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 613 614 /* assign the pointer. Is this really necessary? */ 615 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 616 617 /* allocate space for the triangular factor information */ 618 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 619 620 /* Create the matrix description */ 621 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 622 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 623 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 624 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 625 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 626 627 /* Create the solve analysis information */ 628 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 629 630 /* set the operation */ 631 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 632 633 /* set the matrix */ 634 loTriFactor->csrMat = new CsrMatrix; 635 loTriFactor->csrMat->num_rows = A->rmap->n; 636 loTriFactor->csrMat->num_cols = A->cmap->n; 637 loTriFactor->csrMat->num_entries = a->nz; 638 639 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 640 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 641 642 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 643 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 644 645 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 646 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 647 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 648 649 /* perform the solve analysis */ 650 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 651 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 652 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 653 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 654 655 /* assign the pointer. Is this really necessary? */ 656 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 657 658 A->offloadmask = PETSC_OFFLOAD_BOTH; 659 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 660 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 661 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 662 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 663 } catch(char *ex) { 664 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 665 } 666 } 667 PetscFunctionReturn(0); 668 } 669 670 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 671 { 672 PetscErrorCode ierr; 673 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 674 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 675 IS ip = a->row; 676 const PetscInt *rip; 677 PetscBool perm_identity; 678 PetscInt n = A->rmap->n; 679 680 PetscFunctionBegin; 681 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 682 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 683 cusparseTriFactors->workVector = new THRUSTARRAY(n); 684 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 685 686 /* lower triangular indices */ 687 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 688 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 689 if (!perm_identity) { 690 IS iip; 691 const PetscInt *irip; 692 693 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 694 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 695 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 696 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 697 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 698 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 699 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 700 ierr = ISDestroy(&iip);CHKERRQ(ierr); 701 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 702 } 703 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 704 PetscFunctionReturn(0); 705 } 706 707 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 708 { 709 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 710 IS isrow = b->row,iscol = b->col; 711 PetscBool row_identity,col_identity; 712 PetscErrorCode ierr; 713 714 PetscFunctionBegin; 715 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 716 B->offloadmask = PETSC_OFFLOAD_CPU; 717 /* determine which version of MatSolve needs to be used. */ 718 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 719 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 720 if (row_identity && col_identity) { 721 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 722 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 723 B->ops->matsolve = NULL; 724 B->ops->matsolvetranspose = NULL; 725 } else { 726 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 727 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 728 B->ops->matsolve = NULL; 729 B->ops->matsolvetranspose = NULL; 730 } 731 732 /* get the triangular factors */ 733 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 738 { 739 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 740 IS ip = b->row; 741 PetscBool perm_identity; 742 PetscErrorCode ierr; 743 744 PetscFunctionBegin; 745 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 746 B->offloadmask = PETSC_OFFLOAD_CPU; 747 /* determine which version of MatSolve needs to be used. */ 748 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 749 if (perm_identity) { 750 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 751 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 752 B->ops->matsolve = NULL; 753 B->ops->matsolvetranspose = NULL; 754 } else { 755 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 756 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 757 B->ops->matsolve = NULL; 758 B->ops->matsolvetranspose = NULL; 759 } 760 761 /* get the triangular factors */ 762 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 763 PetscFunctionReturn(0); 764 } 765 766 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 767 { 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 770 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 771 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 772 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 773 cusparseStatus_t stat; 774 cusparseIndexBase_t indexBase; 775 cusparseMatrixType_t matrixType; 776 cusparseFillMode_t fillMode; 777 cusparseDiagType_t diagType; 778 779 PetscFunctionBegin; 780 781 /*********************************************/ 782 /* Now the Transpose of the Lower Tri Factor */ 783 /*********************************************/ 784 785 /* allocate space for the transpose of the lower triangular factor */ 786 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 787 788 /* set the matrix descriptors of the lower triangular factor */ 789 matrixType = cusparseGetMatType(loTriFactor->descr); 790 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 791 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 792 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 793 diagType = cusparseGetMatDiagType(loTriFactor->descr); 794 795 /* Create the matrix description */ 796 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 797 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 798 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 799 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 800 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 801 802 /* Create the solve analysis information */ 803 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 804 805 /* set the operation */ 806 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 807 808 /* allocate GPU space for the CSC of the lower triangular factor*/ 809 loTriFactorT->csrMat = new CsrMatrix; 810 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 811 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 812 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 813 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 814 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 815 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 816 817 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 818 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 819 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 820 loTriFactor->csrMat->values->data().get(), 821 loTriFactor->csrMat->row_offsets->data().get(), 822 loTriFactor->csrMat->column_indices->data().get(), 823 loTriFactorT->csrMat->values->data().get(), 824 loTriFactorT->csrMat->column_indices->data().get(), 825 loTriFactorT->csrMat->row_offsets->data().get(), 826 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 827 828 /* perform the solve analysis on the transposed matrix */ 829 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 830 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 831 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 832 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 833 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 834 835 /* assign the pointer. Is this really necessary? */ 836 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 837 838 /*********************************************/ 839 /* Now the Transpose of the Upper Tri Factor */ 840 /*********************************************/ 841 842 /* allocate space for the transpose of the upper triangular factor */ 843 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 844 845 /* set the matrix descriptors of the upper triangular factor */ 846 matrixType = cusparseGetMatType(upTriFactor->descr); 847 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 848 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 849 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 850 diagType = cusparseGetMatDiagType(upTriFactor->descr); 851 852 /* Create the matrix description */ 853 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 854 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 855 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 856 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 857 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 858 859 /* Create the solve analysis information */ 860 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 861 862 /* set the operation */ 863 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 864 865 /* allocate GPU space for the CSC of the upper triangular factor*/ 866 upTriFactorT->csrMat = new CsrMatrix; 867 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 868 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 869 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 870 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 871 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 872 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 873 874 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 875 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 876 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 877 upTriFactor->csrMat->values->data().get(), 878 upTriFactor->csrMat->row_offsets->data().get(), 879 upTriFactor->csrMat->column_indices->data().get(), 880 upTriFactorT->csrMat->values->data().get(), 881 upTriFactorT->csrMat->column_indices->data().get(), 882 upTriFactorT->csrMat->row_offsets->data().get(), 883 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 884 885 /* perform the solve analysis on the transposed matrix */ 886 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 887 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 888 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 889 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 890 upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 891 892 /* assign the pointer. Is this really necessary? */ 893 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 894 PetscFunctionReturn(0); 895 } 896 897 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 898 { 899 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 900 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 901 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 902 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 903 cusparseStatus_t stat; 904 cusparseIndexBase_t indexBase; 905 cudaError_t err; 906 PetscErrorCode ierr; 907 908 PetscFunctionBegin; 909 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 910 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 911 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 912 /* create cusparse matrix */ 913 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 914 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 915 indexBase = cusparseGetMatIndexBase(matstruct->descr); 916 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 917 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 918 919 /* set alpha and beta */ 920 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 921 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 922 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 923 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 924 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 925 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 926 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 927 928 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 929 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 930 CsrMatrix *matrixT= new CsrMatrix; 931 matrixT->num_rows = A->cmap->n; 932 matrixT->num_cols = A->rmap->n; 933 matrixT->num_entries = a->nz; 934 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 935 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 936 matrixT->values = new THRUSTARRAY(a->nz); 937 938 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 939 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 940 /* compute the transpose, i.e. the CSC */ 941 indexBase = cusparseGetMatIndexBase(matstruct->descr); 942 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 943 A->cmap->n, matrix->num_entries, 944 matrix->values->data().get(), 945 cusparsestruct->rowoffsets_gpu->data().get(), 946 matrix->column_indices->data().get(), 947 matrixT->values->data().get(), 948 matrixT->column_indices->data().get(), 949 matrixT->row_offsets->data().get(), 950 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 951 matstructT->mat = matrixT; 952 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 953 CsrMatrix *temp = new CsrMatrix; 954 CsrMatrix *tempT = new CsrMatrix; 955 956 /* First convert HYB to CSR */ 957 temp->num_rows = A->rmap->n; 958 temp->num_cols = A->cmap->n; 959 temp->num_entries = a->nz; 960 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 961 temp->column_indices = new THRUSTINTARRAY32(a->nz); 962 temp->values = new THRUSTARRAY(a->nz); 963 964 stat = cusparse_hyb2csr(cusparsestruct->handle, 965 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 966 temp->values->data().get(), 967 temp->row_offsets->data().get(), 968 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 969 970 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 971 tempT->num_rows = A->rmap->n; 972 tempT->num_cols = A->cmap->n; 973 tempT->num_entries = a->nz; 974 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 975 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 976 tempT->values = new THRUSTARRAY(a->nz); 977 978 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 979 temp->num_cols, temp->num_entries, 980 temp->values->data().get(), 981 temp->row_offsets->data().get(), 982 temp->column_indices->data().get(), 983 tempT->values->data().get(), 984 tempT->column_indices->data().get(), 985 tempT->row_offsets->data().get(), 986 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 987 988 /* Last, convert CSC to HYB */ 989 cusparseHybMat_t hybMat; 990 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 991 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 992 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 993 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 994 matstructT->descr, tempT->values->data().get(), 995 tempT->row_offsets->data().get(), 996 tempT->column_indices->data().get(), 997 hybMat, 0, partition);CHKERRCUSPARSE(stat); 998 999 /* assign the pointer */ 1000 matstructT->mat = hybMat; 1001 /* delete temporaries */ 1002 if (tempT) { 1003 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1004 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1005 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1006 delete (CsrMatrix*) tempT; 1007 } 1008 if (temp) { 1009 if (temp->values) delete (THRUSTARRAY*) temp->values; 1010 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1011 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1012 delete (CsrMatrix*) temp; 1013 } 1014 } 1015 err = WaitForGPU();CHKERRCUDA(err); 1016 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1017 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1018 /* the compressed row indices is not used for matTranspose */ 1019 matstructT->cprowIndices = NULL; 1020 /* assign the pointer */ 1021 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1022 PetscFunctionReturn(0); 1023 } 1024 1025 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1026 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1027 { 1028 PetscInt n = xx->map->n; 1029 const PetscScalar *barray; 1030 PetscScalar *xarray; 1031 thrust::device_ptr<const PetscScalar> bGPU; 1032 thrust::device_ptr<PetscScalar> xGPU; 1033 cusparseStatus_t stat; 1034 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1035 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1036 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1037 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1038 PetscErrorCode ierr; 1039 cudaError_t cerr; 1040 1041 PetscFunctionBegin; 1042 /* Analyze the matrix and create the transpose ... on the fly */ 1043 if (!loTriFactorT && !upTriFactorT) { 1044 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1045 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1046 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1047 } 1048 1049 /* Get the GPU pointers */ 1050 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1051 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1052 xGPU = thrust::device_pointer_cast(xarray); 1053 bGPU = thrust::device_pointer_cast(barray); 1054 1055 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1056 /* First, reorder with the row permutation */ 1057 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1058 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1059 xGPU); 1060 1061 /* First, solve U */ 1062 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1063 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1064 upTriFactorT->csrMat->values->data().get(), 1065 upTriFactorT->csrMat->row_offsets->data().get(), 1066 upTriFactorT->csrMat->column_indices->data().get(), 1067 upTriFactorT->solveInfo, 1068 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1069 1070 /* Then, solve L */ 1071 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1072 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1073 loTriFactorT->csrMat->values->data().get(), 1074 loTriFactorT->csrMat->row_offsets->data().get(), 1075 loTriFactorT->csrMat->column_indices->data().get(), 1076 loTriFactorT->solveInfo, 1077 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1078 1079 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1080 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1081 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1082 tempGPU->begin()); 1083 1084 /* Copy the temporary to the full solution. */ 1085 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1086 1087 /* restore */ 1088 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1089 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1090 cerr = WaitForGPU();CHKERRCUDA(cerr); 1091 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1092 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1093 PetscFunctionReturn(0); 1094 } 1095 1096 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1097 { 1098 const PetscScalar *barray; 1099 PetscScalar *xarray; 1100 cusparseStatus_t stat; 1101 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1102 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1103 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1104 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1105 PetscErrorCode ierr; 1106 cudaError_t cerr; 1107 1108 PetscFunctionBegin; 1109 /* Analyze the matrix and create the transpose ... on the fly */ 1110 if (!loTriFactorT && !upTriFactorT) { 1111 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1112 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1113 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1114 } 1115 1116 /* Get the GPU pointers */ 1117 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1118 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1119 1120 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1121 /* First, solve U */ 1122 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1123 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1124 upTriFactorT->csrMat->values->data().get(), 1125 upTriFactorT->csrMat->row_offsets->data().get(), 1126 upTriFactorT->csrMat->column_indices->data().get(), 1127 upTriFactorT->solveInfo, 1128 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1129 1130 /* Then, solve L */ 1131 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1132 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1133 loTriFactorT->csrMat->values->data().get(), 1134 loTriFactorT->csrMat->row_offsets->data().get(), 1135 loTriFactorT->csrMat->column_indices->data().get(), 1136 loTriFactorT->solveInfo, 1137 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1138 1139 /* restore */ 1140 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1141 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1142 cerr = WaitForGPU();CHKERRCUDA(cerr); 1143 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1144 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1149 { 1150 const PetscScalar *barray; 1151 PetscScalar *xarray; 1152 thrust::device_ptr<const PetscScalar> bGPU; 1153 thrust::device_ptr<PetscScalar> xGPU; 1154 cusparseStatus_t stat; 1155 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1156 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1157 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1158 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1159 PetscErrorCode ierr; 1160 cudaError_t cerr; 1161 1162 PetscFunctionBegin; 1163 1164 /* Get the GPU pointers */ 1165 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1166 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1167 xGPU = thrust::device_pointer_cast(xarray); 1168 bGPU = thrust::device_pointer_cast(barray); 1169 1170 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1171 /* First, reorder with the row permutation */ 1172 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1173 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1174 tempGPU->begin()); 1175 1176 /* Next, solve L */ 1177 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1178 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1179 loTriFactor->csrMat->values->data().get(), 1180 loTriFactor->csrMat->row_offsets->data().get(), 1181 loTriFactor->csrMat->column_indices->data().get(), 1182 loTriFactor->solveInfo, 1183 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1184 1185 /* Then, solve U */ 1186 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1187 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1188 upTriFactor->csrMat->values->data().get(), 1189 upTriFactor->csrMat->row_offsets->data().get(), 1190 upTriFactor->csrMat->column_indices->data().get(), 1191 upTriFactor->solveInfo, 1192 xarray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1193 1194 /* Last, reorder with the column permutation */ 1195 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1196 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1197 xGPU); 1198 1199 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1200 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1201 cerr = WaitForGPU();CHKERRCUDA(cerr); 1202 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1203 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1204 PetscFunctionReturn(0); 1205 } 1206 1207 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1208 { 1209 const PetscScalar *barray; 1210 PetscScalar *xarray; 1211 cusparseStatus_t stat; 1212 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1213 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1214 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1215 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1216 PetscErrorCode ierr; 1217 cudaError_t cerr; 1218 1219 PetscFunctionBegin; 1220 /* Get the GPU pointers */ 1221 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1222 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1223 1224 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1225 /* First, solve L */ 1226 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1227 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1228 loTriFactor->csrMat->values->data().get(), 1229 loTriFactor->csrMat->row_offsets->data().get(), 1230 loTriFactor->csrMat->column_indices->data().get(), 1231 loTriFactor->solveInfo, 1232 barray, tempGPU->data().get());CHKERRCUSPARSE(stat); 1233 1234 /* Next, solve U */ 1235 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1236 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1237 upTriFactor->csrMat->values->data().get(), 1238 upTriFactor->csrMat->row_offsets->data().get(), 1239 upTriFactor->csrMat->column_indices->data().get(), 1240 upTriFactor->solveInfo, 1241 tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat); 1242 1243 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1244 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1245 cerr = WaitForGPU();CHKERRCUDA(cerr); 1246 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1247 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1248 PetscFunctionReturn(0); 1249 } 1250 1251 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1252 { 1253 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1254 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1255 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1256 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1257 PetscErrorCode ierr; 1258 cusparseStatus_t stat; 1259 cudaError_t err; 1260 1261 PetscFunctionBegin; 1262 if (A->boundtocpu) PetscFunctionReturn(0); 1263 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1264 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1265 /* Copy values only */ 1266 CsrMatrix *mat,*matT; 1267 mat = (CsrMatrix*)cusparsestruct->mat->mat; 1268 1269 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1270 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1271 mat->values->assign(a->a, a->a+a->nz); 1272 err = WaitForGPU();CHKERRCUDA(err); 1273 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1274 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1275 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1276 1277 /* Update matT when it was built before */ 1278 if (cusparsestruct->matTranspose) { 1279 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1280 matT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1281 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1282 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1283 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1284 A->cmap->n, mat->num_entries, 1285 mat->values->data().get(), 1286 cusparsestruct->rowoffsets_gpu->data().get(), 1287 mat->column_indices->data().get(), 1288 matT->values->data().get(), 1289 matT->column_indices->data().get(), 1290 matT->row_offsets->data().get(), 1291 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat); 1292 err = WaitForGPU();CHKERRCUDA(err); 1293 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1294 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1295 } 1296 } else { 1297 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1298 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1299 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1300 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1301 delete cusparsestruct->workVector; 1302 delete cusparsestruct->rowoffsets_gpu; 1303 try { 1304 if (a->compressedrow.use) { 1305 m = a->compressedrow.nrows; 1306 ii = a->compressedrow.i; 1307 ridx = a->compressedrow.rindex; 1308 } else { 1309 m = A->rmap->n; 1310 ii = a->i; 1311 ridx = NULL; 1312 } 1313 cusparsestruct->nrows = m; 1314 1315 /* create cusparse matrix */ 1316 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1317 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1318 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1319 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1320 1321 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1322 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1323 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1324 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1325 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1326 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1327 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1328 1329 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1330 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1331 /* set the matrix */ 1332 CsrMatrix *matrix= new CsrMatrix; 1333 matrix->num_rows = m; 1334 matrix->num_cols = A->cmap->n; 1335 matrix->num_entries = a->nz; 1336 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1337 matrix->row_offsets->assign(ii, ii + m+1); 1338 1339 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1340 matrix->column_indices->assign(a->j, a->j+a->nz); 1341 1342 matrix->values = new THRUSTARRAY(a->nz); 1343 matrix->values->assign(a->a, a->a+a->nz); 1344 1345 /* assign the pointer */ 1346 matstruct->mat = matrix; 1347 1348 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1349 CsrMatrix *matrix= new CsrMatrix; 1350 matrix->num_rows = m; 1351 matrix->num_cols = A->cmap->n; 1352 matrix->num_entries = a->nz; 1353 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1354 matrix->row_offsets->assign(ii, ii + m+1); 1355 1356 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1357 matrix->column_indices->assign(a->j, a->j+a->nz); 1358 1359 matrix->values = new THRUSTARRAY(a->nz); 1360 matrix->values->assign(a->a, a->a+a->nz); 1361 1362 cusparseHybMat_t hybMat; 1363 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1364 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1365 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1366 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1367 matstruct->descr, matrix->values->data().get(), 1368 matrix->row_offsets->data().get(), 1369 matrix->column_indices->data().get(), 1370 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1371 /* assign the pointer */ 1372 matstruct->mat = hybMat; 1373 1374 if (matrix) { 1375 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1376 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1377 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1378 delete (CsrMatrix*)matrix; 1379 } 1380 } 1381 1382 /* assign the compressed row indices */ 1383 if (a->compressedrow.use) { 1384 cusparsestruct->workVector = new THRUSTARRAY(m); 1385 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1386 matstruct->cprowIndices->assign(ridx,ridx+m); 1387 tmp = m; 1388 } else { 1389 cusparsestruct->workVector = NULL; 1390 matstruct->cprowIndices = NULL; 1391 tmp = 0; 1392 } 1393 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1394 1395 /* assign the pointer */ 1396 cusparsestruct->mat = matstruct; 1397 } catch(char *ex) { 1398 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1399 } 1400 err = WaitForGPU();CHKERRCUDA(err); 1401 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1402 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1403 cusparsestruct->nonzerostate = A->nonzerostate; 1404 } 1405 A->offloadmask = PETSC_OFFLOAD_BOTH; 1406 } 1407 PetscFunctionReturn(0); 1408 } 1409 1410 struct VecCUDAPlusEquals 1411 { 1412 template <typename Tuple> 1413 __host__ __device__ 1414 void operator()(Tuple t) 1415 { 1416 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1417 } 1418 }; 1419 1420 struct VecCUDAEqualsReverse 1421 { 1422 template <typename Tuple> 1423 __host__ __device__ 1424 void operator()(Tuple t) 1425 { 1426 thrust::get<0>(t) = thrust::get<1>(t); 1427 } 1428 }; 1429 1430 typedef struct { 1431 PetscBool cisdense; 1432 PetscScalar *Bt; 1433 Mat X; 1434 } MatMatCusparse; 1435 1436 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1437 { 1438 PetscErrorCode ierr; 1439 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1440 cudaError_t cerr; 1441 1442 PetscFunctionBegin; 1443 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1444 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1445 ierr = PetscFree(data);CHKERRQ(ierr); 1446 PetscFunctionReturn(0); 1447 } 1448 1449 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1450 1451 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1452 { 1453 Mat_Product *product = C->product; 1454 Mat A,B; 1455 PetscInt m,n,k,blda,clda; 1456 PetscBool flg,biscuda; 1457 Mat_SeqAIJCUSPARSE *cusp; 1458 cusparseStatus_t stat; 1459 cusparseOperation_t opA; 1460 cusparseMatDescr_t matA; 1461 const PetscScalar *barray; 1462 PetscScalar *carray; 1463 PetscErrorCode ierr; 1464 MatMatCusparse *mmdata; 1465 Mat_SeqAIJCUSPARSEMultStruct *mat; 1466 CsrMatrix *csrmat; 1467 cudaError_t cuer; 1468 1469 PetscFunctionBegin; 1470 MatCheckProduct(C,1); 1471 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1472 mmdata = (MatMatCusparse*)product->data; 1473 A = product->A; 1474 B = product->B; 1475 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1476 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1477 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1478 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1479 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1480 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1481 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1482 switch (product->type) { 1483 case MATPRODUCT_AB: 1484 case MATPRODUCT_PtAP: 1485 mat = cusp->mat; 1486 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1487 m = A->rmap->n; 1488 k = A->cmap->n; 1489 n = B->cmap->n; 1490 break; 1491 case MATPRODUCT_AtB: 1492 if (!cusp->transgen) { 1493 mat = cusp->mat; 1494 opA = CUSPARSE_OPERATION_TRANSPOSE; 1495 } else { 1496 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1497 mat = cusp->matTranspose; 1498 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1499 } 1500 m = A->cmap->n; 1501 k = A->rmap->n; 1502 n = B->cmap->n; 1503 break; 1504 case MATPRODUCT_ABt: 1505 case MATPRODUCT_RARt: 1506 mat = cusp->mat; 1507 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1508 m = A->rmap->n; 1509 k = B->cmap->n; 1510 n = B->rmap->n; 1511 break; 1512 default: 1513 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1514 } 1515 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1516 matA = mat->descr; 1517 csrmat = (CsrMatrix*)mat->mat; 1518 /* if the user passed a CPU matrix, copy the data to the GPU */ 1519 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1520 if (!biscuda) { 1521 ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1522 } 1523 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1524 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1525 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1526 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1527 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1528 } else { 1529 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1530 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1531 } 1532 1533 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1534 1535 /* cusparse spmm does not support transpose on B */ 1536 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1537 cublasHandle_t cublasv2handle; 1538 cublasStatus_t cerr; 1539 1540 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1541 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1542 B->cmap->n,B->rmap->n, 1543 &PETSC_CUSPARSE_ONE ,barray,blda, 1544 &PETSC_CUSPARSE_ZERO,barray,blda, 1545 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1546 blda = B->cmap->n; 1547 } 1548 1549 /* perform the MatMat operation */ 1550 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1551 csrmat->num_entries,mat->alpha,matA, 1552 csrmat->values->data().get(), 1553 csrmat->row_offsets->data().get(), 1554 csrmat->column_indices->data().get(), 1555 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1556 carray,clda);CHKERRCUSPARSE(stat); 1557 cuer = WaitForGPU();CHKERRCUDA(cuer); 1558 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1559 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1560 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1561 if (product->type == MATPRODUCT_RARt) { 1562 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1563 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1564 } else if (product->type == MATPRODUCT_PtAP) { 1565 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1566 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1567 } else { 1568 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1569 } 1570 if (mmdata->cisdense) { 1571 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1572 } 1573 if (!biscuda) { 1574 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1575 } 1576 PetscFunctionReturn(0); 1577 } 1578 1579 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1580 { 1581 Mat_Product *product = C->product; 1582 Mat A,B; 1583 PetscInt m,n; 1584 PetscBool cisdense,flg; 1585 PetscErrorCode ierr; 1586 MatMatCusparse *mmdata; 1587 Mat_SeqAIJCUSPARSE *cusp; 1588 1589 PetscFunctionBegin; 1590 MatCheckProduct(C,1); 1591 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1592 A = product->A; 1593 B = product->B; 1594 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1595 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1596 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1597 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1598 switch (product->type) { 1599 case MATPRODUCT_AB: 1600 m = A->rmap->n; 1601 n = B->cmap->n; 1602 break; 1603 case MATPRODUCT_AtB: 1604 m = A->cmap->n; 1605 n = B->cmap->n; 1606 break; 1607 case MATPRODUCT_ABt: 1608 m = A->rmap->n; 1609 n = B->rmap->n; 1610 break; 1611 case MATPRODUCT_PtAP: 1612 m = B->cmap->n; 1613 n = B->cmap->n; 1614 break; 1615 case MATPRODUCT_RARt: 1616 m = B->rmap->n; 1617 n = B->rmap->n; 1618 break; 1619 default: 1620 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1621 } 1622 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 1623 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 1624 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 1625 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 1626 1627 /* product data */ 1628 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 1629 mmdata->cisdense = cisdense; 1630 /* cusparse spmm does not support transpose on B */ 1631 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1632 cudaError_t cerr; 1633 1634 cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 1635 } 1636 /* for these products we need intermediate storage */ 1637 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1638 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 1639 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 1640 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 1641 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 1642 } else { 1643 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1644 } 1645 } 1646 C->product->data = mmdata; 1647 C->product->destroy = MatDestroy_MatMatCusparse; 1648 1649 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1650 PetscFunctionReturn(0); 1651 } 1652 1653 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 1654 1655 /* handles dense B */ 1656 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 1657 { 1658 Mat_Product *product = C->product; 1659 PetscErrorCode ierr; 1660 1661 PetscFunctionBegin; 1662 MatCheckProduct(C,1); 1663 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 1664 if (product->A->boundtocpu) { 1665 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 1666 PetscFunctionReturn(0); 1667 } 1668 switch (product->type) { 1669 case MATPRODUCT_AB: 1670 case MATPRODUCT_AtB: 1671 case MATPRODUCT_ABt: 1672 case MATPRODUCT_PtAP: 1673 case MATPRODUCT_RARt: 1674 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 1675 default: 1676 break; 1677 } 1678 PetscFunctionReturn(0); 1679 } 1680 1681 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1682 { 1683 PetscErrorCode ierr; 1684 1685 PetscFunctionBegin; 1686 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1687 PetscFunctionReturn(0); 1688 } 1689 1690 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 1691 { 1692 PetscErrorCode ierr; 1693 1694 PetscFunctionBegin; 1695 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1696 PetscFunctionReturn(0); 1697 } 1698 1699 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1700 { 1701 PetscErrorCode ierr; 1702 1703 PetscFunctionBegin; 1704 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 1705 PetscFunctionReturn(0); 1706 } 1707 1708 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1709 { 1710 PetscErrorCode ierr; 1711 1712 PetscFunctionBegin; 1713 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 1714 PetscFunctionReturn(0); 1715 } 1716 1717 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1718 { 1719 PetscErrorCode ierr; 1720 1721 PetscFunctionBegin; 1722 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1723 PetscFunctionReturn(0); 1724 } 1725 1726 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 1727 { 1728 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1729 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1730 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1731 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 1732 PetscErrorCode ierr; 1733 cudaError_t cerr; 1734 cusparseStatus_t stat; 1735 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1736 PetscBool compressed; 1737 1738 PetscFunctionBegin; 1739 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 1740 if (!a->nonzerorowcnt) { 1741 if (!yy) { 1742 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1743 } 1744 PetscFunctionReturn(0); 1745 } 1746 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1747 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1748 if (!trans) { 1749 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1750 } else { 1751 if (herm || !cusparsestruct->transgen) { 1752 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 1753 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1754 } else { 1755 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1756 if (!matstruct) { 1757 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1758 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1759 } 1760 } 1761 } 1762 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 1763 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 1764 1765 try { 1766 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 1767 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 1768 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 1769 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1770 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 1771 xptr = xarray; 1772 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */ 1773 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 1774 } else { 1775 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */ 1776 dptr = zarray; 1777 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 1778 1779 if (compressed) { 1780 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 1781 1782 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 1783 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 1784 VecCUDAEqualsReverse()); 1785 } 1786 } 1787 1788 /* csr_spmv does y = alpha*Ax + beta*y */ 1789 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1790 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1791 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 1792 mat->num_rows, mat->num_cols, 1793 mat->num_entries, matstruct->alpha, matstruct->descr, 1794 mat->values->data().get(), mat->row_offsets->data().get(), 1795 mat->column_indices->data().get(), xptr, beta, 1796 dptr);CHKERRCUSPARSE(stat); 1797 } else { 1798 if (cusparsestruct->nrows) { 1799 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1800 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 1801 matstruct->alpha, matstruct->descr, hybMat, 1802 xptr, beta, 1803 dptr);CHKERRCUSPARSE(stat); 1804 } 1805 } 1806 cerr = WaitForGPU();CHKERRCUDA(cerr); 1807 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1808 1809 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 1810 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 1811 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 1812 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 1813 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 1814 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 1815 } 1816 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 1817 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1818 } 1819 1820 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 1821 if (compressed) { 1822 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 1823 1824 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1825 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1826 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 1827 VecCUDAPlusEquals()); 1828 cerr = WaitForGPU();CHKERRCUDA(cerr); 1829 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1830 } 1831 } else { 1832 if (yy && yy != zz) { 1833 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 1834 } 1835 } 1836 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 1837 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 1838 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 1839 } catch(char *ex) { 1840 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1841 } 1842 if (yy) { 1843 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1844 } else { 1845 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 1846 } 1847 PetscFunctionReturn(0); 1848 } 1849 1850 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1851 { 1852 PetscErrorCode ierr; 1853 1854 PetscFunctionBegin; 1855 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1856 PetscFunctionReturn(0); 1857 } 1858 1859 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1860 { 1861 PetscErrorCode ierr; 1862 1863 PetscFunctionBegin; 1864 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1865 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 1866 if (A->factortype == MAT_FACTOR_NONE) { 1867 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1868 } 1869 PetscFunctionReturn(0); 1870 } 1871 1872 /* --------------------------------------------------------------------------------*/ 1873 /*@ 1874 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1875 (the default parallel PETSc format). This matrix will ultimately pushed down 1876 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1877 assembly performance the user should preallocate the matrix storage by setting 1878 the parameter nz (or the array nnz). By setting these parameters accurately, 1879 performance during matrix assembly can be increased by more than a factor of 50. 1880 1881 Collective 1882 1883 Input Parameters: 1884 + comm - MPI communicator, set to PETSC_COMM_SELF 1885 . m - number of rows 1886 . n - number of columns 1887 . nz - number of nonzeros per row (same for all rows) 1888 - nnz - array containing the number of nonzeros in the various rows 1889 (possibly different for each row) or NULL 1890 1891 Output Parameter: 1892 . A - the matrix 1893 1894 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1895 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1896 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1897 1898 Notes: 1899 If nnz is given then nz is ignored 1900 1901 The AIJ format (also called the Yale sparse matrix format or 1902 compressed row storage), is fully compatible with standard Fortran 77 1903 storage. That is, the stored row and column indices can begin at 1904 either one (as in Fortran) or zero. See the users' manual for details. 1905 1906 Specify the preallocated storage with either nz or nnz (not both). 1907 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1908 allocation. For large problems you MUST preallocate memory or you 1909 will get TERRIBLE performance, see the users' manual chapter on matrices. 1910 1911 By default, this format uses inodes (identical nodes) when possible, to 1912 improve numerical efficiency of matrix-vector products and solves. We 1913 search for consecutive rows with the same nonzero structure, thereby 1914 reusing matrix information to achieve increased efficiency. 1915 1916 Level: intermediate 1917 1918 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1919 @*/ 1920 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1921 { 1922 PetscErrorCode ierr; 1923 1924 PetscFunctionBegin; 1925 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1926 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1927 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1928 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1929 PetscFunctionReturn(0); 1930 } 1931 1932 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1933 { 1934 PetscErrorCode ierr; 1935 1936 PetscFunctionBegin; 1937 if (A->factortype == MAT_FACTOR_NONE) { 1938 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1939 } else { 1940 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1941 } 1942 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 1943 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 1944 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 1945 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 1946 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 1951 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 1952 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1953 { 1954 PetscErrorCode ierr; 1955 1956 PetscFunctionBegin; 1957 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1958 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 1959 PetscFunctionReturn(0); 1960 } 1961 1962 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 1963 { 1964 PetscErrorCode ierr; 1965 1966 PetscFunctionBegin; 1967 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 1968 /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETS_OFFLOAD_GPU. 1969 If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one. 1970 Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case. 1971 TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matries; 1972 can follow the example of MatBindToCPU_SeqAIJViennaCL(). */ 1973 if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov"); 1974 /* TODO: add support for this? */ 1975 if (flg) { 1976 A->ops->mult = MatMult_SeqAIJ; 1977 A->ops->multadd = MatMultAdd_SeqAIJ; 1978 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 1979 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 1980 A->ops->multhermitiantranspose = NULL; 1981 A->ops->multhermitiantransposeadd = NULL; 1982 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 1983 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 1984 } else { 1985 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1986 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1987 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1988 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1989 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 1990 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 1991 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 1992 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 1993 } 1994 A->boundtocpu = flg; 1995 PetscFunctionReturn(0); 1996 } 1997 1998 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1999 { 2000 PetscErrorCode ierr; 2001 cusparseStatus_t stat; 2002 Mat B; 2003 2004 PetscFunctionBegin; 2005 if (reuse == MAT_INITIAL_MATRIX) { 2006 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2007 } else if (reuse == MAT_REUSE_MATRIX) { 2008 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2009 } 2010 B = *newmat; 2011 2012 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 2013 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 2014 2015 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 2016 if (B->factortype == MAT_FACTOR_NONE) { 2017 Mat_SeqAIJCUSPARSE *spptr; 2018 2019 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2020 spptr->format = MAT_CUSPARSE_CSR; 2021 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2022 B->spptr = spptr; 2023 } else { 2024 Mat_SeqAIJCUSPARSETriFactors *spptr; 2025 2026 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2027 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2028 B->spptr = spptr; 2029 } 2030 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2031 } 2032 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 2033 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 2034 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 2035 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2036 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 2037 2038 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 2039 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2040 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 2041 PetscFunctionReturn(0); 2042 } 2043 2044 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 2045 { 2046 PetscErrorCode ierr; 2047 2048 PetscFunctionBegin; 2049 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 2050 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2051 PetscFunctionReturn(0); 2052 } 2053 2054 /*MC 2055 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2056 2057 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 2058 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 2059 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2060 2061 Options Database Keys: 2062 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2063 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2064 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2065 2066 Level: beginner 2067 2068 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2069 M*/ 2070 2071 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2072 2073 2074 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2075 { 2076 PetscErrorCode ierr; 2077 2078 PetscFunctionBegin; 2079 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2080 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2081 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2082 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2083 PetscFunctionReturn(0); 2084 } 2085 2086 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2087 { 2088 PetscErrorCode ierr; 2089 cusparseStatus_t stat; 2090 cusparseHandle_t handle; 2091 2092 PetscFunctionBegin; 2093 if (*cusparsestruct) { 2094 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2095 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2096 delete (*cusparsestruct)->workVector; 2097 delete (*cusparsestruct)->rowoffsets_gpu; 2098 if (handle = (*cusparsestruct)->handle) { 2099 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2100 } 2101 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2102 } 2103 PetscFunctionReturn(0); 2104 } 2105 2106 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2107 { 2108 PetscFunctionBegin; 2109 if (*mat) { 2110 delete (*mat)->values; 2111 delete (*mat)->column_indices; 2112 delete (*mat)->row_offsets; 2113 delete *mat; 2114 *mat = 0; 2115 } 2116 PetscFunctionReturn(0); 2117 } 2118 2119 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2120 { 2121 cusparseStatus_t stat; 2122 PetscErrorCode ierr; 2123 2124 PetscFunctionBegin; 2125 if (*trifactor) { 2126 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2127 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2128 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2129 delete *trifactor; 2130 *trifactor = 0; 2131 } 2132 PetscFunctionReturn(0); 2133 } 2134 2135 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2136 { 2137 CsrMatrix *mat; 2138 cusparseStatus_t stat; 2139 cudaError_t err; 2140 2141 PetscFunctionBegin; 2142 if (*matstruct) { 2143 if ((*matstruct)->mat) { 2144 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2145 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2146 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2147 } else { 2148 mat = (CsrMatrix*)(*matstruct)->mat; 2149 CsrMatrix_Destroy(&mat); 2150 } 2151 } 2152 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2153 delete (*matstruct)->cprowIndices; 2154 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 2155 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2156 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2157 delete *matstruct; 2158 *matstruct = 0; 2159 } 2160 PetscFunctionReturn(0); 2161 } 2162 2163 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2164 { 2165 PetscErrorCode ierr; 2166 2167 PetscFunctionBegin; 2168 if (*trifactors) { 2169 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2170 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2171 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2172 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2173 delete (*trifactors)->rpermIndices; 2174 delete (*trifactors)->cpermIndices; 2175 delete (*trifactors)->workVector; 2176 (*trifactors)->rpermIndices = 0; 2177 (*trifactors)->cpermIndices = 0; 2178 (*trifactors)->workVector = 0; 2179 } 2180 PetscFunctionReturn(0); 2181 } 2182 2183 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2184 { 2185 PetscErrorCode ierr; 2186 cusparseHandle_t handle; 2187 cusparseStatus_t stat; 2188 2189 PetscFunctionBegin; 2190 if (*trifactors) { 2191 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2192 if (handle = (*trifactors)->handle) { 2193 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2194 } 2195 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2196 } 2197 PetscFunctionReturn(0); 2198 } 2199