1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 40 41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 42 { 43 cusparseStatus_t stat; 44 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 45 46 PetscFunctionBegin; 47 cusparsestruct->stream = stream; 48 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat); 49 PetscFunctionReturn(0); 50 } 51 52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 53 { 54 cusparseStatus_t stat; 55 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 56 57 PetscFunctionBegin; 58 if (cusparsestruct->handle != handle) { 59 if (cusparsestruct->handle) { 60 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat); 61 } 62 cusparsestruct->handle = handle; 63 } 64 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 65 PetscFunctionReturn(0); 66 } 67 68 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 69 { 70 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 71 PetscFunctionBegin; 72 if (cusparsestruct->handle) 73 cusparsestruct->handle = 0; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 78 { 79 PetscFunctionBegin; 80 *type = MATSOLVERCUSPARSE; 81 PetscFunctionReturn(0); 82 } 83 84 /*MC 85 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 86 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 87 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 88 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 89 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 90 algorithms are not recommended. This class does NOT support direct solver operations. 91 92 Level: beginner 93 94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 95 M*/ 96 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 123 { 124 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 125 126 PetscFunctionBegin; 127 #if CUDA_VERSION>=4020 128 switch (op) { 129 case MAT_CUSPARSE_MULT: 130 cusparsestruct->format = format; 131 break; 132 case MAT_CUSPARSE_ALL: 133 cusparsestruct->format = format; 134 break; 135 default: 136 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 137 } 138 #else 139 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 140 #endif 141 PetscFunctionReturn(0); 142 } 143 144 /*@ 145 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 146 operation. Only the MatMult operation can use different GPU storage formats 147 for MPIAIJCUSPARSE matrices. 148 Not Collective 149 150 Input Parameters: 151 + A - Matrix of type SEQAIJCUSPARSE 152 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 153 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 154 155 Output Parameter: 156 157 Level: intermediate 158 159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 160 @*/ 161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 162 { 163 PetscErrorCode ierr; 164 165 PetscFunctionBegin; 166 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 167 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 168 PetscFunctionReturn(0); 169 } 170 171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 172 { 173 PetscErrorCode ierr; 174 MatCUSPARSEStorageFormat format; 175 PetscBool flg; 176 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 177 178 PetscFunctionBegin; 179 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 180 ierr = PetscObjectOptionsBegin((PetscObject)A); 181 if (A->factortype==MAT_FACTOR_NONE) { 182 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 183 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 184 if (flg) { 185 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 186 } 187 } 188 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 192 } 193 ierr = PetscOptionsEnd();CHKERRQ(ierr); 194 PetscFunctionReturn(0); 195 196 } 197 198 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 199 { 200 PetscErrorCode ierr; 201 202 PetscFunctionBegin; 203 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 204 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 205 PetscFunctionReturn(0); 206 } 207 208 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 209 { 210 PetscErrorCode ierr; 211 212 PetscFunctionBegin; 213 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 214 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 215 PetscFunctionReturn(0); 216 } 217 218 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 219 { 220 PetscErrorCode ierr; 221 222 PetscFunctionBegin; 223 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 224 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 225 PetscFunctionReturn(0); 226 } 227 228 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 229 { 230 PetscErrorCode ierr; 231 232 PetscFunctionBegin; 233 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 234 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 239 { 240 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 241 PetscInt n = A->rmap->n; 242 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 243 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 244 cusparseStatus_t stat; 245 const PetscInt *ai = a->i,*aj = a->j,*vi; 246 const MatScalar *aa = a->a,*v; 247 PetscInt *AiLo, *AjLo; 248 PetscScalar *AALo; 249 PetscInt i,nz, nzLower, offset, rowOffset; 250 PetscErrorCode ierr; 251 252 PetscFunctionBegin; 253 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 254 try { 255 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 256 nzLower=n+ai[n]-ai[1]; 257 258 /* Allocate Space for the lower triangular matrix */ 259 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 260 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr); 261 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr); 262 263 /* Fill the lower triangular matrix */ 264 AiLo[0] = (PetscInt) 0; 265 AiLo[n] = nzLower; 266 AjLo[0] = (PetscInt) 0; 267 AALo[0] = (MatScalar) 1.0; 268 v = aa; 269 vi = aj; 270 offset = 1; 271 rowOffset= 1; 272 for (i=1; i<n; i++) { 273 nz = ai[i+1] - ai[i]; 274 /* additional 1 for the term on the diagonal */ 275 AiLo[i] = rowOffset; 276 rowOffset += nz+1; 277 278 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 279 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 280 281 offset += nz; 282 AjLo[offset] = (PetscInt) i; 283 AALo[offset] = (MatScalar) 1.0; 284 offset += 1; 285 286 v += nz; 287 vi += nz; 288 } 289 290 /* allocate space for the triangular factor information */ 291 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 292 293 /* Create the matrix description */ 294 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 295 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 296 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 297 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat); 298 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 299 300 /* Create the solve analysis information */ 301 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 302 303 /* set the operation */ 304 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 305 306 /* set the matrix */ 307 loTriFactor->csrMat = new CsrMatrix; 308 loTriFactor->csrMat->num_rows = n; 309 loTriFactor->csrMat->num_cols = n; 310 loTriFactor->csrMat->num_entries = nzLower; 311 312 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 313 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 314 315 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 316 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 317 318 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 319 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 320 321 /* perform the solve analysis */ 322 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 323 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 324 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 325 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 326 327 /* assign the pointer. Is this really necessary? */ 328 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 329 330 ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr); 331 ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr); 332 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 333 } catch(char *ex) { 334 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 335 } 336 } 337 PetscFunctionReturn(0); 338 } 339 340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 341 { 342 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 343 PetscInt n = A->rmap->n; 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 345 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 346 cusparseStatus_t stat; 347 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 348 const MatScalar *aa = a->a,*v; 349 PetscInt *AiUp, *AjUp; 350 PetscScalar *AAUp; 351 PetscInt i,nz, nzUpper, offset; 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 356 try { 357 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 358 nzUpper = adiag[0]-adiag[n]; 359 360 /* Allocate Space for the upper triangular matrix */ 361 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 362 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 363 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 364 365 /* Fill the upper triangular matrix */ 366 AiUp[0]=(PetscInt) 0; 367 AiUp[n]=nzUpper; 368 offset = nzUpper; 369 for (i=n-1; i>=0; i--) { 370 v = aa + adiag[i+1] + 1; 371 vi = aj + adiag[i+1] + 1; 372 373 /* number of elements NOT on the diagonal */ 374 nz = adiag[i] - adiag[i+1]-1; 375 376 /* decrement the offset */ 377 offset -= (nz+1); 378 379 /* first, set the diagonal elements */ 380 AjUp[offset] = (PetscInt) i; 381 AAUp[offset] = (MatScalar)1./v[nz]; 382 AiUp[i] = AiUp[i+1] - (nz+1); 383 384 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 385 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 386 } 387 388 /* allocate space for the triangular factor information */ 389 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 390 391 /* Create the matrix description */ 392 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 393 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 394 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 395 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 396 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 397 398 /* Create the solve analysis information */ 399 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 400 401 /* set the operation */ 402 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 403 404 /* set the matrix */ 405 upTriFactor->csrMat = new CsrMatrix; 406 upTriFactor->csrMat->num_rows = n; 407 upTriFactor->csrMat->num_cols = n; 408 upTriFactor->csrMat->num_entries = nzUpper; 409 410 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 411 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 412 413 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 414 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 415 416 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 417 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 418 419 /* perform the solve analysis */ 420 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 421 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 422 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 423 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 424 425 /* assign the pointer. Is this really necessary? */ 426 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 427 428 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 429 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 430 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 431 } catch(char *ex) { 432 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 433 } 434 } 435 PetscFunctionReturn(0); 436 } 437 438 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 439 { 440 PetscErrorCode ierr; 441 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 442 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 443 IS isrow = a->row,iscol = a->icol; 444 PetscBool row_identity,col_identity; 445 const PetscInt *r,*c; 446 PetscInt n = A->rmap->n; 447 448 PetscFunctionBegin; 449 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 450 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 451 452 cusparseTriFactors->workVector = new THRUSTARRAY; 453 cusparseTriFactors->workVector->resize(n); 454 cusparseTriFactors->nnz=a->nz; 455 456 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 457 /*lower triangular indices */ 458 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 459 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 460 if (!row_identity) { 461 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 462 cusparseTriFactors->rpermIndices->assign(r, r+n); 463 } 464 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 465 466 /*upper triangular indices */ 467 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 468 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 469 if (!col_identity) { 470 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 471 cusparseTriFactors->cpermIndices->assign(c, c+n); 472 } 473 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 474 PetscFunctionReturn(0); 475 } 476 477 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 478 { 479 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 480 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 481 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 482 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 483 cusparseStatus_t stat; 484 PetscErrorCode ierr; 485 PetscInt *AiUp, *AjUp; 486 PetscScalar *AAUp; 487 PetscScalar *AALo; 488 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 489 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 490 const PetscInt *ai = b->i,*aj = b->j,*vj; 491 const MatScalar *aa = b->a,*v; 492 493 PetscFunctionBegin; 494 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 495 try { 496 /* Allocate Space for the upper triangular matrix */ 497 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 498 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 499 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 500 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 501 502 /* Fill the upper triangular matrix */ 503 AiUp[0]=(PetscInt) 0; 504 AiUp[n]=nzUpper; 505 offset = 0; 506 for (i=0; i<n; i++) { 507 /* set the pointers */ 508 v = aa + ai[i]; 509 vj = aj + ai[i]; 510 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 511 512 /* first, set the diagonal elements */ 513 AjUp[offset] = (PetscInt) i; 514 AAUp[offset] = (MatScalar)1.0/v[nz]; 515 AiUp[i] = offset; 516 AALo[offset] = (MatScalar)1.0/v[nz]; 517 518 offset+=1; 519 if (nz>0) { 520 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 521 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 522 for (j=offset; j<offset+nz; j++) { 523 AAUp[j] = -AAUp[j]; 524 AALo[j] = AAUp[j]/v[nz]; 525 } 526 offset+=nz; 527 } 528 } 529 530 /* allocate space for the triangular factor information */ 531 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 532 533 /* Create the matrix description */ 534 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 535 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 536 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 537 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 538 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 539 540 /* Create the solve analysis information */ 541 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 542 543 /* set the operation */ 544 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 545 546 /* set the matrix */ 547 upTriFactor->csrMat = new CsrMatrix; 548 upTriFactor->csrMat->num_rows = A->rmap->n; 549 upTriFactor->csrMat->num_cols = A->cmap->n; 550 upTriFactor->csrMat->num_entries = a->nz; 551 552 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 553 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 554 555 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 556 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 557 558 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 559 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 560 561 /* perform the solve analysis */ 562 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 563 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 564 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 565 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 566 567 /* assign the pointer. Is this really necessary? */ 568 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 569 570 /* allocate space for the triangular factor information */ 571 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 572 573 /* Create the matrix description */ 574 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 575 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 576 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 577 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 578 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 579 580 /* Create the solve analysis information */ 581 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 582 583 /* set the operation */ 584 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 585 586 /* set the matrix */ 587 loTriFactor->csrMat = new CsrMatrix; 588 loTriFactor->csrMat->num_rows = A->rmap->n; 589 loTriFactor->csrMat->num_cols = A->cmap->n; 590 loTriFactor->csrMat->num_entries = a->nz; 591 592 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 593 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 594 595 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 596 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 597 598 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 599 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 600 601 /* perform the solve analysis */ 602 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 603 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 604 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 605 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 606 607 /* assign the pointer. Is this really necessary? */ 608 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 609 610 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 611 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 612 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 613 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 614 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 615 } catch(char *ex) { 616 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 617 } 618 } 619 PetscFunctionReturn(0); 620 } 621 622 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 623 { 624 PetscErrorCode ierr; 625 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 626 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 627 IS ip = a->row; 628 const PetscInt *rip; 629 PetscBool perm_identity; 630 PetscInt n = A->rmap->n; 631 632 PetscFunctionBegin; 633 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 634 cusparseTriFactors->workVector = new THRUSTARRAY; 635 cusparseTriFactors->workVector->resize(n); 636 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 637 638 /*lower triangular indices */ 639 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 640 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 641 if (!perm_identity) { 642 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 643 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 644 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 645 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 646 } 647 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 648 PetscFunctionReturn(0); 649 } 650 651 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 652 { 653 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 654 IS isrow = b->row,iscol = b->col; 655 PetscBool row_identity,col_identity; 656 PetscErrorCode ierr; 657 658 PetscFunctionBegin; 659 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 660 /* determine which version of MatSolve needs to be used. */ 661 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 662 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 663 if (row_identity && col_identity) { 664 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 665 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 666 } else { 667 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 668 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 669 } 670 671 /* get the triangular factors */ 672 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 673 PetscFunctionReturn(0); 674 } 675 676 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 677 { 678 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 679 IS ip = b->row; 680 PetscBool perm_identity; 681 PetscErrorCode ierr; 682 683 PetscFunctionBegin; 684 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 685 686 /* determine which version of MatSolve needs to be used. */ 687 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 688 if (perm_identity) { 689 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 690 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 691 } else { 692 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 693 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 694 } 695 696 /* get the triangular factors */ 697 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 698 PetscFunctionReturn(0); 699 } 700 701 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 702 { 703 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 704 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 705 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 706 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 707 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 708 cusparseStatus_t stat; 709 cusparseIndexBase_t indexBase; 710 cusparseMatrixType_t matrixType; 711 cusparseFillMode_t fillMode; 712 cusparseDiagType_t diagType; 713 714 PetscFunctionBegin; 715 716 /*********************************************/ 717 /* Now the Transpose of the Lower Tri Factor */ 718 /*********************************************/ 719 720 /* allocate space for the transpose of the lower triangular factor */ 721 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 722 723 /* set the matrix descriptors of the lower triangular factor */ 724 matrixType = cusparseGetMatType(loTriFactor->descr); 725 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 726 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 727 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 728 diagType = cusparseGetMatDiagType(loTriFactor->descr); 729 730 /* Create the matrix description */ 731 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 732 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 733 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 734 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 735 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 736 737 /* Create the solve analysis information */ 738 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 739 740 /* set the operation */ 741 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 742 743 /* allocate GPU space for the CSC of the lower triangular factor*/ 744 loTriFactorT->csrMat = new CsrMatrix; 745 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 746 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 747 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 748 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 749 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 750 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 751 752 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 753 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 754 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 755 loTriFactor->csrMat->values->data().get(), 756 loTriFactor->csrMat->row_offsets->data().get(), 757 loTriFactor->csrMat->column_indices->data().get(), 758 loTriFactorT->csrMat->values->data().get(), 759 loTriFactorT->csrMat->column_indices->data().get(), 760 loTriFactorT->csrMat->row_offsets->data().get(), 761 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 762 763 /* perform the solve analysis on the transposed matrix */ 764 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 765 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 766 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 767 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 768 loTriFactorT->solveInfo);CHKERRCUDA(stat); 769 770 /* assign the pointer. Is this really necessary? */ 771 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 772 773 /*********************************************/ 774 /* Now the Transpose of the Upper Tri Factor */ 775 /*********************************************/ 776 777 /* allocate space for the transpose of the upper triangular factor */ 778 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 779 780 /* set the matrix descriptors of the upper triangular factor */ 781 matrixType = cusparseGetMatType(upTriFactor->descr); 782 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 783 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 784 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 785 diagType = cusparseGetMatDiagType(upTriFactor->descr); 786 787 /* Create the matrix description */ 788 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 789 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 790 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 791 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 792 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 793 794 /* Create the solve analysis information */ 795 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 796 797 /* set the operation */ 798 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 799 800 /* allocate GPU space for the CSC of the upper triangular factor*/ 801 upTriFactorT->csrMat = new CsrMatrix; 802 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 803 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 804 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 805 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 806 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 807 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 808 809 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 810 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 811 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 812 upTriFactor->csrMat->values->data().get(), 813 upTriFactor->csrMat->row_offsets->data().get(), 814 upTriFactor->csrMat->column_indices->data().get(), 815 upTriFactorT->csrMat->values->data().get(), 816 upTriFactorT->csrMat->column_indices->data().get(), 817 upTriFactorT->csrMat->row_offsets->data().get(), 818 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 819 820 /* perform the solve analysis on the transposed matrix */ 821 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 822 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 823 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 824 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 825 upTriFactorT->solveInfo);CHKERRCUDA(stat); 826 827 /* assign the pointer. Is this really necessary? */ 828 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 829 PetscFunctionReturn(0); 830 } 831 832 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 833 { 834 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 835 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 836 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 837 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 838 cusparseStatus_t stat; 839 cusparseIndexBase_t indexBase; 840 cudaError_t err; 841 842 PetscFunctionBegin; 843 844 /* allocate space for the triangular factor information */ 845 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 846 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 847 indexBase = cusparseGetMatIndexBase(matstruct->descr); 848 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 849 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 850 851 /* set alpha and beta */ 852 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 853 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 854 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 855 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 856 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 857 858 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 859 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 860 CsrMatrix *matrixT= new CsrMatrix; 861 matrixT->num_rows = A->cmap->n; 862 matrixT->num_cols = A->rmap->n; 863 matrixT->num_entries = a->nz; 864 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 865 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 866 matrixT->values = new THRUSTARRAY(a->nz); 867 868 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 869 indexBase = cusparseGetMatIndexBase(matstruct->descr); 870 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 871 matrix->num_cols, matrix->num_entries, 872 matrix->values->data().get(), 873 matrix->row_offsets->data().get(), 874 matrix->column_indices->data().get(), 875 matrixT->values->data().get(), 876 matrixT->column_indices->data().get(), 877 matrixT->row_offsets->data().get(), 878 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 879 880 /* assign the pointer */ 881 matstructT->mat = matrixT; 882 883 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 884 #if CUDA_VERSION>=5000 885 /* First convert HYB to CSR */ 886 CsrMatrix *temp= new CsrMatrix; 887 temp->num_rows = A->rmap->n; 888 temp->num_cols = A->cmap->n; 889 temp->num_entries = a->nz; 890 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 891 temp->column_indices = new THRUSTINTARRAY32(a->nz); 892 temp->values = new THRUSTARRAY(a->nz); 893 894 895 stat = cusparse_hyb2csr(cusparsestruct->handle, 896 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 897 temp->values->data().get(), 898 temp->row_offsets->data().get(), 899 temp->column_indices->data().get());CHKERRCUDA(stat); 900 901 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 902 CsrMatrix *tempT= new CsrMatrix; 903 tempT->num_rows = A->rmap->n; 904 tempT->num_cols = A->cmap->n; 905 tempT->num_entries = a->nz; 906 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 907 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 908 tempT->values = new THRUSTARRAY(a->nz); 909 910 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 911 temp->num_cols, temp->num_entries, 912 temp->values->data().get(), 913 temp->row_offsets->data().get(), 914 temp->column_indices->data().get(), 915 tempT->values->data().get(), 916 tempT->column_indices->data().get(), 917 tempT->row_offsets->data().get(), 918 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 919 920 /* Last, convert CSC to HYB */ 921 cusparseHybMat_t hybMat; 922 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 923 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 924 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 925 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 926 matstructT->descr, tempT->values->data().get(), 927 tempT->row_offsets->data().get(), 928 tempT->column_indices->data().get(), 929 hybMat, 0, partition);CHKERRCUDA(stat); 930 931 /* assign the pointer */ 932 matstructT->mat = hybMat; 933 934 /* delete temporaries */ 935 if (tempT) { 936 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 937 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 938 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 939 delete (CsrMatrix*) tempT; 940 } 941 if (temp) { 942 if (temp->values) delete (THRUSTARRAY*) temp->values; 943 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 944 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 945 delete (CsrMatrix*) temp; 946 } 947 #else 948 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 949 #endif 950 } 951 /* assign the compressed row indices */ 952 matstructT->cprowIndices = new THRUSTINTARRAY; 953 matstructT->cprowIndices->resize(A->cmap->n); 954 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 955 956 /* assign the pointer */ 957 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 958 PetscFunctionReturn(0); 959 } 960 961 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 962 { 963 PetscInt n = xx->map->n; 964 const PetscScalar *barray; 965 PetscScalar *xarray; 966 thrust::device_ptr<const PetscScalar> bGPU; 967 thrust::device_ptr<PetscScalar> xGPU; 968 cusparseStatus_t stat; 969 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 970 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 971 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 972 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 973 PetscErrorCode ierr; 974 975 PetscFunctionBegin; 976 /* Analyze the matrix and create the transpose ... on the fly */ 977 if (!loTriFactorT && !upTriFactorT) { 978 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 979 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 980 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 981 } 982 983 /* Get the GPU pointers */ 984 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 985 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 986 xGPU = thrust::device_pointer_cast(xarray); 987 bGPU = thrust::device_pointer_cast(barray); 988 989 /* First, reorder with the row permutation */ 990 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 991 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 992 xGPU); 993 994 /* First, solve U */ 995 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 996 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 997 upTriFactorT->csrMat->values->data().get(), 998 upTriFactorT->csrMat->row_offsets->data().get(), 999 upTriFactorT->csrMat->column_indices->data().get(), 1000 upTriFactorT->solveInfo, 1001 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1002 1003 /* Then, solve L */ 1004 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1005 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1006 loTriFactorT->csrMat->values->data().get(), 1007 loTriFactorT->csrMat->row_offsets->data().get(), 1008 loTriFactorT->csrMat->column_indices->data().get(), 1009 loTriFactorT->solveInfo, 1010 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1011 1012 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1013 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1014 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1015 tempGPU->begin()); 1016 1017 /* Copy the temporary to the full solution. */ 1018 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1019 1020 /* restore */ 1021 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1022 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1023 ierr = WaitForGPU();CHKERRCUDA(ierr); 1024 1025 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1026 PetscFunctionReturn(0); 1027 } 1028 1029 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1030 { 1031 const PetscScalar *barray; 1032 PetscScalar *xarray; 1033 cusparseStatus_t stat; 1034 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1035 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1036 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1037 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1038 PetscErrorCode ierr; 1039 1040 PetscFunctionBegin; 1041 /* Analyze the matrix and create the transpose ... on the fly */ 1042 if (!loTriFactorT && !upTriFactorT) { 1043 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1044 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1045 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1046 } 1047 1048 /* Get the GPU pointers */ 1049 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1050 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1051 1052 /* First, solve U */ 1053 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1054 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1055 upTriFactorT->csrMat->values->data().get(), 1056 upTriFactorT->csrMat->row_offsets->data().get(), 1057 upTriFactorT->csrMat->column_indices->data().get(), 1058 upTriFactorT->solveInfo, 1059 barray, tempGPU->data().get());CHKERRCUDA(stat); 1060 1061 /* Then, solve L */ 1062 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1063 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1064 loTriFactorT->csrMat->values->data().get(), 1065 loTriFactorT->csrMat->row_offsets->data().get(), 1066 loTriFactorT->csrMat->column_indices->data().get(), 1067 loTriFactorT->solveInfo, 1068 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1069 1070 /* restore */ 1071 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1072 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1073 ierr = WaitForGPU();CHKERRCUDA(ierr); 1074 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1075 PetscFunctionReturn(0); 1076 } 1077 1078 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1079 { 1080 const PetscScalar *barray; 1081 PetscScalar *xarray; 1082 thrust::device_ptr<const PetscScalar> bGPU; 1083 thrust::device_ptr<PetscScalar> xGPU; 1084 cusparseStatus_t stat; 1085 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1086 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1087 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1088 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1089 PetscErrorCode ierr; 1090 1091 PetscFunctionBegin; 1092 1093 /* Get the GPU pointers */ 1094 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1095 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1096 xGPU = thrust::device_pointer_cast(xarray); 1097 bGPU = thrust::device_pointer_cast(barray); 1098 1099 /* First, reorder with the row permutation */ 1100 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1101 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1102 xGPU); 1103 1104 /* Next, solve L */ 1105 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1106 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1107 loTriFactor->csrMat->values->data().get(), 1108 loTriFactor->csrMat->row_offsets->data().get(), 1109 loTriFactor->csrMat->column_indices->data().get(), 1110 loTriFactor->solveInfo, 1111 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1112 1113 /* Then, solve U */ 1114 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1115 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1116 upTriFactor->csrMat->values->data().get(), 1117 upTriFactor->csrMat->row_offsets->data().get(), 1118 upTriFactor->csrMat->column_indices->data().get(), 1119 upTriFactor->solveInfo, 1120 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1121 1122 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1123 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1124 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1125 tempGPU->begin()); 1126 1127 /* Copy the temporary to the full solution. */ 1128 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1129 1130 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1131 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1132 ierr = WaitForGPU();CHKERRCUDA(ierr); 1133 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1134 PetscFunctionReturn(0); 1135 } 1136 1137 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1138 { 1139 const PetscScalar *barray; 1140 PetscScalar *xarray; 1141 cusparseStatus_t stat; 1142 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1143 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1144 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1145 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1146 PetscErrorCode ierr; 1147 1148 PetscFunctionBegin; 1149 /* Get the GPU pointers */ 1150 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1151 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1152 1153 /* First, solve L */ 1154 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1155 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1156 loTriFactor->csrMat->values->data().get(), 1157 loTriFactor->csrMat->row_offsets->data().get(), 1158 loTriFactor->csrMat->column_indices->data().get(), 1159 loTriFactor->solveInfo, 1160 barray, tempGPU->data().get());CHKERRCUDA(stat); 1161 1162 /* Next, solve U */ 1163 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1164 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1165 upTriFactor->csrMat->values->data().get(), 1166 upTriFactor->csrMat->row_offsets->data().get(), 1167 upTriFactor->csrMat->column_indices->data().get(), 1168 upTriFactor->solveInfo, 1169 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1170 1171 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1172 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1173 ierr = WaitForGPU();CHKERRCUDA(ierr); 1174 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1175 PetscFunctionReturn(0); 1176 } 1177 1178 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1179 { 1180 1181 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1182 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1183 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1184 PetscInt m = A->rmap->n,*ii,*ridx; 1185 PetscErrorCode ierr; 1186 cusparseStatus_t stat; 1187 cudaError_t err; 1188 1189 PetscFunctionBegin; 1190 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1191 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1192 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1193 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1194 /* copy values only */ 1195 matrix->values->assign(a->a, a->a+a->nz); 1196 } else { 1197 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1198 try { 1199 cusparsestruct->nonzerorow=0; 1200 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1201 1202 if (a->compressedrow.use) { 1203 m = a->compressedrow.nrows; 1204 ii = a->compressedrow.i; 1205 ridx = a->compressedrow.rindex; 1206 } else { 1207 /* Forcing compressed row on the GPU */ 1208 int k=0; 1209 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1210 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1211 ii[0]=0; 1212 for (int j = 0; j<m; j++) { 1213 if ((a->i[j+1]-a->i[j])>0) { 1214 ii[k] = a->i[j]; 1215 ridx[k]= j; 1216 k++; 1217 } 1218 } 1219 ii[cusparsestruct->nonzerorow] = a->nz; 1220 m = cusparsestruct->nonzerorow; 1221 } 1222 1223 /* allocate space for the triangular factor information */ 1224 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1225 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1226 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1227 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1228 1229 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 1230 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1231 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err); 1232 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1233 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1234 1235 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1236 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1237 /* set the matrix */ 1238 CsrMatrix *matrix= new CsrMatrix; 1239 matrix->num_rows = m; 1240 matrix->num_cols = A->cmap->n; 1241 matrix->num_entries = a->nz; 1242 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1243 matrix->row_offsets->assign(ii, ii + m+1); 1244 1245 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1246 matrix->column_indices->assign(a->j, a->j+a->nz); 1247 1248 matrix->values = new THRUSTARRAY(a->nz); 1249 matrix->values->assign(a->a, a->a+a->nz); 1250 1251 /* assign the pointer */ 1252 matstruct->mat = matrix; 1253 1254 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1255 #if CUDA_VERSION>=4020 1256 CsrMatrix *matrix= new CsrMatrix; 1257 matrix->num_rows = m; 1258 matrix->num_cols = A->cmap->n; 1259 matrix->num_entries = a->nz; 1260 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1261 matrix->row_offsets->assign(ii, ii + m+1); 1262 1263 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1264 matrix->column_indices->assign(a->j, a->j+a->nz); 1265 1266 matrix->values = new THRUSTARRAY(a->nz); 1267 matrix->values->assign(a->a, a->a+a->nz); 1268 1269 cusparseHybMat_t hybMat; 1270 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1271 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1272 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1273 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1274 matstruct->descr, matrix->values->data().get(), 1275 matrix->row_offsets->data().get(), 1276 matrix->column_indices->data().get(), 1277 hybMat, 0, partition);CHKERRCUDA(stat); 1278 /* assign the pointer */ 1279 matstruct->mat = hybMat; 1280 1281 if (matrix) { 1282 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1283 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1284 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1285 delete (CsrMatrix*)matrix; 1286 } 1287 #endif 1288 } 1289 1290 /* assign the compressed row indices */ 1291 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1292 matstruct->cprowIndices->assign(ridx,ridx+m); 1293 1294 /* assign the pointer */ 1295 cusparsestruct->mat = matstruct; 1296 1297 if (!a->compressedrow.use) { 1298 ierr = PetscFree(ii);CHKERRQ(ierr); 1299 ierr = PetscFree(ridx);CHKERRQ(ierr); 1300 } 1301 cusparsestruct->workVector = new THRUSTARRAY; 1302 cusparsestruct->workVector->resize(m); 1303 } catch(char *ex) { 1304 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1305 } 1306 cusparsestruct->nonzerostate = A->nonzerostate; 1307 } 1308 ierr = WaitForGPU();CHKERRCUDA(ierr); 1309 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1310 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1311 } 1312 PetscFunctionReturn(0); 1313 } 1314 1315 static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1316 { 1317 PetscErrorCode ierr; 1318 PetscInt rbs,cbs; 1319 1320 PetscFunctionBegin; 1321 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 1322 if (right) { 1323 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1324 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1325 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 1326 ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr); 1327 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1328 } 1329 if (left) { 1330 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1331 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1332 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 1333 ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr); 1334 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1335 } 1336 PetscFunctionReturn(0); 1337 } 1338 1339 struct VecCUDAPlusEquals 1340 { 1341 template <typename Tuple> 1342 __host__ __device__ 1343 void operator()(Tuple t) 1344 { 1345 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1346 } 1347 }; 1348 1349 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1350 { 1351 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1352 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1353 Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */ 1354 const PetscScalar *xarray; 1355 PetscScalar *yarray; 1356 PetscErrorCode ierr; 1357 cusparseStatus_t stat; 1358 1359 PetscFunctionBegin; 1360 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1361 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1362 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1363 ierr = VecSet(yy,0);CHKERRQ(ierr); 1364 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1365 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1366 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1367 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1368 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1369 mat->num_rows, mat->num_cols, mat->num_entries, 1370 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1371 mat->column_indices->data().get(), xarray, matstruct->beta, 1372 yarray);CHKERRCUDA(stat); 1373 } else { 1374 #if CUDA_VERSION>=4020 1375 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1376 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1377 matstruct->alpha, matstruct->descr, hybMat, 1378 xarray, matstruct->beta, 1379 yarray);CHKERRCUDA(stat); 1380 #endif 1381 } 1382 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1383 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1384 if (!cusparsestruct->stream) { 1385 ierr = WaitForGPU();CHKERRCUDA(ierr); 1386 } 1387 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1388 PetscFunctionReturn(0); 1389 } 1390 1391 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1392 { 1393 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1394 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1395 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1396 const PetscScalar *xarray; 1397 PetscScalar *yarray; 1398 PetscErrorCode ierr; 1399 cusparseStatus_t stat; 1400 1401 PetscFunctionBegin; 1402 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1403 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1404 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1405 if (!matstructT) { 1406 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1407 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1408 } 1409 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1410 ierr = VecSet(yy,0);CHKERRQ(ierr); 1411 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1412 1413 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1414 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1415 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1416 mat->num_rows, mat->num_cols, 1417 mat->num_entries, matstructT->alpha, matstructT->descr, 1418 mat->values->data().get(), mat->row_offsets->data().get(), 1419 mat->column_indices->data().get(), xarray, matstructT->beta, 1420 yarray);CHKERRCUDA(stat); 1421 } else { 1422 #if CUDA_VERSION>=4020 1423 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1424 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1425 matstructT->alpha, matstructT->descr, hybMat, 1426 xarray, matstructT->beta, 1427 yarray);CHKERRCUDA(stat); 1428 #endif 1429 } 1430 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1431 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1432 if (!cusparsestruct->stream) { 1433 ierr = WaitForGPU();CHKERRCUDA(ierr); 1434 } 1435 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1436 PetscFunctionReturn(0); 1437 } 1438 1439 1440 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1441 { 1442 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1443 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1444 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1445 thrust::device_ptr<PetscScalar> zptr; 1446 const PetscScalar *xarray; 1447 PetscScalar *zarray; 1448 PetscErrorCode ierr; 1449 cusparseStatus_t stat; 1450 1451 PetscFunctionBegin; 1452 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1453 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1454 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1455 try { 1456 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1457 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1458 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1459 zptr = thrust::device_pointer_cast(zarray); 1460 1461 /* multiply add */ 1462 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1463 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1464 /* here we need to be careful to set the number of rows in the multiply to the 1465 number of compressed rows in the matrix ... which is equivalent to the 1466 size of the workVector */ 1467 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1468 mat->num_rows, mat->num_cols, 1469 mat->num_entries, matstruct->alpha, matstruct->descr, 1470 mat->values->data().get(), mat->row_offsets->data().get(), 1471 mat->column_indices->data().get(), xarray, matstruct->beta, 1472 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1473 } else { 1474 #if CUDA_VERSION>=4020 1475 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1476 if (cusparsestruct->workVector->size()) { 1477 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1478 matstruct->alpha, matstruct->descr, hybMat, 1479 xarray, matstruct->beta, 1480 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1481 } 1482 #endif 1483 } 1484 1485 /* scatter the data from the temporary into the full vector with a += operation */ 1486 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1487 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1488 VecCUDAPlusEquals()); 1489 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1490 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1491 1492 } catch(char *ex) { 1493 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1494 } 1495 ierr = WaitForGPU();CHKERRCUDA(ierr); 1496 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1497 PetscFunctionReturn(0); 1498 } 1499 1500 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1501 { 1502 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1503 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1504 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1505 thrust::device_ptr<PetscScalar> zptr; 1506 const PetscScalar *xarray; 1507 PetscScalar *zarray; 1508 PetscErrorCode ierr; 1509 cusparseStatus_t stat; 1510 1511 PetscFunctionBegin; 1512 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1513 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1514 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1515 if (!matstructT) { 1516 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1517 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1518 } 1519 1520 try { 1521 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1522 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1523 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1524 zptr = thrust::device_pointer_cast(zarray); 1525 1526 /* multiply add with matrix transpose */ 1527 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1528 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1529 /* here we need to be careful to set the number of rows in the multiply to the 1530 number of compressed rows in the matrix ... which is equivalent to the 1531 size of the workVector */ 1532 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1533 mat->num_rows, mat->num_cols, 1534 mat->num_entries, matstructT->alpha, matstructT->descr, 1535 mat->values->data().get(), mat->row_offsets->data().get(), 1536 mat->column_indices->data().get(), xarray, matstructT->beta, 1537 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1538 } else { 1539 #if CUDA_VERSION>=4020 1540 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1541 if (cusparsestruct->workVector->size()) { 1542 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1543 matstructT->alpha, matstructT->descr, hybMat, 1544 xarray, matstructT->beta, 1545 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1546 } 1547 #endif 1548 } 1549 1550 /* scatter the data from the temporary into the full vector with a += operation */ 1551 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1552 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1553 VecCUDAPlusEquals()); 1554 1555 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1556 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1557 1558 } catch(char *ex) { 1559 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1560 } 1561 ierr = WaitForGPU();CHKERRCUDA(ierr); 1562 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1563 PetscFunctionReturn(0); 1564 } 1565 1566 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1567 { 1568 PetscErrorCode ierr; 1569 1570 PetscFunctionBegin; 1571 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1572 if (A->factortype==MAT_FACTOR_NONE) { 1573 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1574 } 1575 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1576 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1577 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1578 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1579 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1580 PetscFunctionReturn(0); 1581 } 1582 1583 /* --------------------------------------------------------------------------------*/ 1584 /*@ 1585 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1586 (the default parallel PETSc format). This matrix will ultimately pushed down 1587 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1588 assembly performance the user should preallocate the matrix storage by setting 1589 the parameter nz (or the array nnz). By setting these parameters accurately, 1590 performance during matrix assembly can be increased by more than a factor of 50. 1591 1592 Collective on MPI_Comm 1593 1594 Input Parameters: 1595 + comm - MPI communicator, set to PETSC_COMM_SELF 1596 . m - number of rows 1597 . n - number of columns 1598 . nz - number of nonzeros per row (same for all rows) 1599 - nnz - array containing the number of nonzeros in the various rows 1600 (possibly different for each row) or NULL 1601 1602 Output Parameter: 1603 . A - the matrix 1604 1605 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1606 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1607 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1608 1609 Notes: 1610 If nnz is given then nz is ignored 1611 1612 The AIJ format (also called the Yale sparse matrix format or 1613 compressed row storage), is fully compatible with standard Fortran 77 1614 storage. That is, the stored row and column indices can begin at 1615 either one (as in Fortran) or zero. See the users' manual for details. 1616 1617 Specify the preallocated storage with either nz or nnz (not both). 1618 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1619 allocation. For large problems you MUST preallocate memory or you 1620 will get TERRIBLE performance, see the users' manual chapter on matrices. 1621 1622 By default, this format uses inodes (identical nodes) when possible, to 1623 improve numerical efficiency of matrix-vector products and solves. We 1624 search for consecutive rows with the same nonzero structure, thereby 1625 reusing matrix information to achieve increased efficiency. 1626 1627 Level: intermediate 1628 1629 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1630 @*/ 1631 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1632 { 1633 PetscErrorCode ierr; 1634 1635 PetscFunctionBegin; 1636 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1637 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1638 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1639 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1640 PetscFunctionReturn(0); 1641 } 1642 1643 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1644 { 1645 PetscErrorCode ierr; 1646 1647 PetscFunctionBegin; 1648 if (A->factortype==MAT_FACTOR_NONE) { 1649 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1650 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1651 } 1652 } else { 1653 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1654 } 1655 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1656 PetscFunctionReturn(0); 1657 } 1658 1659 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1660 { 1661 PetscErrorCode ierr; 1662 Mat C; 1663 cusparseStatus_t stat; 1664 cusparseHandle_t handle=0; 1665 1666 PetscFunctionBegin; 1667 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1668 C = *B; 1669 /* inject CUSPARSE-specific stuff */ 1670 if (C->factortype==MAT_FACTOR_NONE) { 1671 /* you cannot check the inode.use flag here since the matrix was just created. 1672 now build a GPU matrix data structure */ 1673 C->spptr = new Mat_SeqAIJCUSPARSE; 1674 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1675 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1676 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1677 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1678 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1679 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = 0; 1680 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1681 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1682 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1683 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1684 } else { 1685 /* NEXT, set the pointers to the triangular factors */ 1686 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1687 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1688 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1689 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1690 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1691 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1692 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1693 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1694 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1695 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1696 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1697 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1698 } 1699 1700 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1701 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1702 C->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1703 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1704 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1705 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1706 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1707 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1708 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1709 1710 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1711 1712 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1713 1714 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1715 PetscFunctionReturn(0); 1716 } 1717 1718 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1719 { 1720 PetscErrorCode ierr; 1721 cusparseStatus_t stat; 1722 cusparseHandle_t handle=0; 1723 1724 PetscFunctionBegin; 1725 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1726 if (B->factortype==MAT_FACTOR_NONE) { 1727 /* you cannot check the inode.use flag here since the matrix was just created. 1728 now build a GPU matrix data structure */ 1729 B->spptr = new Mat_SeqAIJCUSPARSE; 1730 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1731 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1732 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1733 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1734 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1735 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1736 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1737 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1738 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1739 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1740 } else { 1741 /* NEXT, set the pointers to the triangular factors */ 1742 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1743 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1744 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1745 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1746 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1747 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1748 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1749 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1750 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1751 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1752 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1753 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1754 } 1755 1756 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1757 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1758 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1759 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1760 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1761 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1762 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1763 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1764 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1765 1766 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1767 1768 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1769 1770 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1771 PetscFunctionReturn(0); 1772 } 1773 1774 /*MC 1775 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1776 1777 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1778 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1779 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1780 1781 Options Database Keys: 1782 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1783 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1784 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1785 1786 Level: beginner 1787 1788 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1789 M*/ 1790 1791 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1792 1793 1794 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1795 { 1796 PetscErrorCode ierr; 1797 1798 PetscFunctionBegin; 1799 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1800 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1801 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1802 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1803 PetscFunctionReturn(0); 1804 } 1805 1806 1807 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1808 { 1809 cusparseStatus_t stat; 1810 cusparseHandle_t handle; 1811 1812 PetscFunctionBegin; 1813 if (*cusparsestruct) { 1814 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1815 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1816 delete (*cusparsestruct)->workVector; 1817 if (handle = (*cusparsestruct)->handle) { 1818 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1819 } 1820 delete *cusparsestruct; 1821 *cusparsestruct = 0; 1822 } 1823 PetscFunctionReturn(0); 1824 } 1825 1826 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1827 { 1828 PetscFunctionBegin; 1829 if (*mat) { 1830 delete (*mat)->values; 1831 delete (*mat)->column_indices; 1832 delete (*mat)->row_offsets; 1833 delete *mat; 1834 *mat = 0; 1835 } 1836 PetscFunctionReturn(0); 1837 } 1838 1839 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1840 { 1841 cusparseStatus_t stat; 1842 PetscErrorCode ierr; 1843 1844 PetscFunctionBegin; 1845 if (*trifactor) { 1846 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1847 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1848 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1849 delete *trifactor; 1850 *trifactor = 0; 1851 } 1852 PetscFunctionReturn(0); 1853 } 1854 1855 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1856 { 1857 CsrMatrix *mat; 1858 cusparseStatus_t stat; 1859 cudaError_t err; 1860 1861 PetscFunctionBegin; 1862 if (*matstruct) { 1863 if ((*matstruct)->mat) { 1864 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1865 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1866 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1867 } else { 1868 mat = (CsrMatrix*)(*matstruct)->mat; 1869 CsrMatrix_Destroy(&mat); 1870 } 1871 } 1872 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1873 delete (*matstruct)->cprowIndices; 1874 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1875 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1876 delete *matstruct; 1877 *matstruct = 0; 1878 } 1879 PetscFunctionReturn(0); 1880 } 1881 1882 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1883 { 1884 cusparseHandle_t handle; 1885 cusparseStatus_t stat; 1886 1887 PetscFunctionBegin; 1888 if (*trifactors) { 1889 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1890 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1891 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1892 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1893 delete (*trifactors)->rpermIndices; 1894 delete (*trifactors)->cpermIndices; 1895 delete (*trifactors)->workVector; 1896 if (handle = (*trifactors)->handle) { 1897 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1898 } 1899 delete *trifactors; 1900 *trifactors = 0; 1901 } 1902 PetscFunctionReturn(0); 1903 } 1904 1905