/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Human-readable names for MatCUSPARSEStorageFormat, consumed by PetscOptionsEnum() below */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization kernels implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solve and matrix-vector product kernels */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Forward declarations: destroy/reset helpers for the GPU-side data structures.
   Note the two MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (this is a .cu file), one for tri-factor structs and one for mult structs. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a CUDA stream to the cusparse handle stored on A.
   A must already be a SeqAIJCUSPARSE matrix (A->spptr is cast without checking). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Replace the cusparse handle stored on A, destroying any different handle the
   matrix previously owned.  The pointer mode is set to DEVICE, i.e. scalar
   arguments to subsequent cusparse calls are passed as device pointers. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the old handle before taking ownership of the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/* Drop the stored cusparse handle WITHOUT destroying it (the caller keeps ownership) */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the (empty) factor matrix B
   and installs the symbolic-factorization kernels appropriate for ftype.  The numeric
   kernels are installed later by the symbolic routines themselves. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factorization requires a square (local) matrix */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(); records the requested
   storage format in the GPU-side struct.  MULT and ALL currently set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if it has been composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices of any other type */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options for a SeqAIJCUSPARSE matrix.
   The transpose-generation and per-multiply format options only apply to
   unfactored matrices; the combined storage-format option always applies. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",cusparsestruct->transgen,&cusparsestruct->transgen,NULL);CHKERRQ(ierr);
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: defer to the CPU AIJ implementation, then install the CUSPARSE numeric kernel */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU AIJ implementation, then install the CUSPARSE numeric kernel */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: defer to the CPU AIJ implementation, then install the CUSPARSE numeric kernel */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: defer to the CPU AIJ implementation, then install the CUSPARSE numeric kernel */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower-triangular factor L from the CPU ILU/LU factor
   stored in A (a factored SeqAIJ matrix), upload it as a CSR matrix to the GPU,
   and run the cusparse triangular-solve analysis on it.  Host-side staging arrays
   use pinned memory (cudaMallocHost).  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of L holds only the implicit unit diagonal; rows 1..n-1 contribute their strictly-lower entries plus one diagonal each */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() on the thrust device vectors performs the host-to-device copy */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper-triangular factor U from the CPU ILU/LU factor stored in A,
   invert its diagonal entries (the CPU factor stores the pivot, the GPU solve
   wants 1/pivot on the diagonal), upload it as CSR, and run the cusparse
   triangular-solve analysis.  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] runs backwards in the SeqAIJ factored-matrix layout, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking the rows in reverse to match the storage order of adiag */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* v[nz] is the stored inverse pivot; see SeqAIJ factor layout */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Rebuild both ILU triangular factors on the GPU and upload the row/column
   permutations (only when they are not the identity).  Called after each
   numeric LU/ILU factorization. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard any stale GPU factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were actually copied */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both GPU triangular factors for ICC/Cholesky from the CPU factor stored
   in A.  Only the upper factor U is stored (SBAIJ layout); the "lower" factor is
   realized as a TRANSPOSE solve on upper-triangular data, with values rescaled
   by the inverse pivot.  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;   /* ICC factor data is laid out as SBAIJ */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];   /* v[nz] holds the diagonal entry of row i */
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate U's off-diagonals and additionally scale the lo-factor copy by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.
         Note the lo factor deliberately reuses the UPPER fill mode together with a
         TRANSPOSE solve op below: the same upper-triangular sparsity (AiUp/AjUp) is
         solved transposed, which realizes the lower-triangular solve. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Rebuild both ICC triangular factors on the GPU and upload the permutation and
   its inverse (only when not the identity).  Called after each numeric
   Cholesky/ICC factorization. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* only U is stored; count the strictly-upper entries twice (for L and U) plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* Cholesky uses the same permutation for rows and columns, so the column
       permutation uploaded here is simply the inverse of the row permutation */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU/ILU: run the CPU factorization, select the natural-ordering fast
   path for the solves when both permutations are the identity, then rebuild the
   GPU triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky/ICC: analogous to the LU case above — CPU factorization,
   solve-kernel selection, then GPU factor rebuild. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* delegate the numeric factorization to the CPU kernel (statement continues on the next, un-reformatted line of the file) */
  ierr =
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 746 B->offloadmask = PETSC_OFFLOAD_CPU; 747 /* determine which version of MatSolve needs to be used. */ 748 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 749 if (perm_identity) { 750 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 751 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 752 B->ops->matsolve = NULL; 753 B->ops->matsolvetranspose = NULL; 754 } else { 755 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 756 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 757 B->ops->matsolve = NULL; 758 B->ops->matsolvetranspose = NULL; 759 } 760 761 /* get the triangular factors */ 762 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 763 PetscFunctionReturn(0); 764 } 765 766 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 767 { 768 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 769 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 770 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 771 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 772 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 773 cusparseStatus_t stat; 774 cusparseIndexBase_t indexBase; 775 cusparseMatrixType_t matrixType; 776 cusparseFillMode_t fillMode; 777 cusparseDiagType_t diagType; 778 779 PetscFunctionBegin; 780 781 /*********************************************/ 782 /* Now the Transpose of the Lower Tri Factor */ 783 /*********************************************/ 784 785 /* allocate space for the transpose of the lower triangular factor */ 786 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 787 788 /* set the 
matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* transposing swaps the fill mode */
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* create the matrix descriptor for the transposed lower factor */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC form of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed lower factor */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* mirror the descriptor of the upper factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* create the matrix descriptor */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* explicit transpose, so no transpose op during the solve */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC form of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the transposed upper factor */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Assemble an explicit transpose of the matrix on the GPU for use by the
   transpose multiply routines; a no-op unless cusparsestruct->transgen is set. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen) PetscFunctionReturn(0);
  /* allocate space for the transpose-multiply information; the value read
     from A->spptr above is overwritten (callers only come here when the
     transpose has not been built yet) */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars for the multiply kernels */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix
*matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transposed dimensions: rows of A^T = columns of A */
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* cache the uncompressed row offsets of A on the GPU; csr2csc needs the
       full row pointer.
       NOTE(review): any previously cached rowoffsets_gpu is not freed here;
       verify callers always destroy it before regenerating the transpose */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* first convert HYB to CSR */
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* next, convert CSR to CSC (i.e. the matrix transpose).
       BUGFIX: tempT holds A^T, so its dimensions are the transposed ones:
       num_rows = A->cmap->n and a row pointer of length A->cmap->n+1.
       csr2csc writes num_cols+1 column pointers into its last array, so the
       previous sizing with A->rmap->n overflowed for rectangular matrices
       with more columns than rows (matching the correct handling in the CSR
       branch above). */
    tempT->num_rows       = A->cmap->n;
    tempT->num_cols       = A->rmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* last, convert CSC to HYB; the HYB matrix holds A^T, hence the
       transposed dimensions (identical to before for square matrices) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete the CSR temporaries */
    if (tempT) {
      if (tempT->values)         delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets)    delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values)         delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets)    delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* publish the transpose */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use
   op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T (stored explicitly, so the op is non-transpose) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve in the natural-ordering case: no permutations needed; solve
   with the explicitly stored transposed factors, U^T first, then L^T. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached triangular factors: apply the row permutation
   to b, solve L then U, and apply the column permutation to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation (the second iterator only fixes
     the copy length; the base pointer dereferenced is that of the first) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Natural-ordering forward/backward solve: no permutations, L then U. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Mirror the host AIJ matrix on the GPU.  If only numerical values changed
   (same nonzero state, CSR format), refresh the values in place (and the
   transpose, if one was built); otherwise rebuild the whole GPU
   representation in the requested storage format (CSR, ELL or HYB). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same sparsity pattern: copy values only */
      CsrMatrix *mat,*matT;
      mat = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* refresh matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, mat->num_entries,
                                mat->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                mat->column_indices->data().get(),
                                matT->values->data().get(),
                                matT->column_indices->data().get(),
                                matT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* nonzero pattern changed: tear down and rebuild the GPU matrix */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* allocate space for the matrix-multiply information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars used by the multiply kernels */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* build the matrix in the chosen storage format */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a CSR scratch copy, convert it to HYB, then discard it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values)         delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets)    delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor: accumulate, second tuple slot += first tuple slot. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Functor: reverse copy, first tuple slot <- second tuple slot. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) =
thrust::get<1>(t);
  }
};

/* Context attached to a MatProduct that pairs a seqaijcusparse matrix with a
   dense CUDA matrix. */
typedef struct {
  PetscBool   cisdense; /* C was originally MATSEQDENSE (CPU); convert back after the numeric phase */
  PetscScalar *Bt;      /* explicit transpose of B (cusparse spmm cannot transpose B) */
  Mat         X;        /* intermediate product used by PtAP/RARt */
} MatMatCusparse;

/* Destroy callback for the MatMatCusparse product data. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of C = op(A)*op(B) (plus the PtAP/RARt variants) with A
   seqaijcusparse and B dense, computed with cusparse csrmm; PtAP/RARt finish
   with a dense-dense multiply of the intermediate product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the operand, the cusparse op and the csrmm dimensions */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use (or lazily build) the explicit transpose */
      if (!cusp->matTranspose) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      }
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B: transpose it explicitly
     with cublas into the preallocated Bt buffer */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForGPU();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo any temporary type conversions made above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* determine the result sizes for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    /* device buffer for an explicit transpose of B, filled during the numeric phase */
    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    /* A lives on the CPU: fall back to the host AIJ x Dense implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* added: previously fell through into default (harmless, but trips -Wimplicit-fallthrough) */
  default:
    break;
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx (conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Shared kernel behind MatMult/MatMultAdd and their (Hermitian) transpose variants:
   zz = op(A)*xx + yy, where yy may be NULL (then zz = op(A)*xx), yy may alias zz,
   trans selects op = transpose, and herm additionally selects the conjugate transpose
   (herm requires trans). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed; /* matrix stored with zero rows dropped (compressed row) */

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* matrix has no nonzero rows: op(A)*xx == 0 */
    if (!yy) {
      ierr =
VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* FIX: with an all-zero-row matrix, MatMultAdd must still produce zz = 0 + yy.
         Previously zz was left untouched when yy was given and zz != yy. */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose available (or Hermitian requested): let cusparse do op(A) */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) the explicitly stored transpose */
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx indexed by cprowIndices into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) { /* ELL/HYB storage; skip the call entirely for 0-row matrices */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForGPU();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForGPU();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: spmv already accumulated beta*zz when yy == zz; only add distinct yy */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
PetscFunctionReturn(0);
}

/* After host-side assembly completes, push the (non-factored) matrix to the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free the GPU-side data (mult structures or triangular factors), then the host AIJ data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate on the host as SeqAIJ, then convert the copy in place to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Switch the matrix operations between the host (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg == PETSC_TRUE binds the matrix to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: set the CUDA vector type, create the
   cusparse handle / GPU side data, and install the CUSPARSE operations */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  Mat B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse !=
MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU mult/multadd/... operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the CUSPARSE solver for all factor types supported on MATSEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Free all GPU-side data of a non-factored SeqAIJCUSPARSE matrix, including its cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr =
MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    handle = (*cusparsestruct)->handle; /* assignment moved out of the condition (silences -Wparentheses) */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (thrust device vectors) and zero the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular factor structure (descriptor, solve analysis info, CSR data) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one mult structure; the storage format decides whether mat is a HYB handle or a CsrMatrix */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fixed: return code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Release the triangular factors and permutation/work data, but keep the struct and its handle */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = 0;
    (*trifactors)->cpermIndices = 0;
    (*trifactors)->workVector   = 0;
  }
  PetscFunctionReturn(0);
}

/* Fully destroy the triangular-factor data, including the cusparse handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* assignment moved out of the condition (silences -Wparentheses) */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}