/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Enum-string table for MatCUSPARSEStorageFormat; the final three entries
   (enum type name, option prefix, terminating 0) follow the PetscOptionsEnum convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: GPU triangular solve and SpMV kernels for this matrix type */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
/* NOTE(review): C++ overload on the pointer type — one variant destroys a triangular-factor
   struct, the other a mult struct for a given storage format. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a caller-supplied CUDA stream to A's cuSPARSE handle; subsequent
   cuSPARSE calls made through this handle are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Install a caller-supplied cuSPARSE handle, destroying any different handle the
   matrix previously held, and (re)set the pointer mode to DEVICE. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Forget the stored cuSPARSE handle without destroying it.
   NOTE(review): assumes the handle's owner (e.g. the caller of MatCUSPARSESetHandle)
   is responsible for destroying it — confirm against callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback: reports MATSOLVERCUSPARSE as the solver package of A's factors. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
  Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create the (square, n x n) factor matrix B for A and install the symbolic
   factorization routines for the requested factor type; the numeric routines are
   installed later, by the symbolic phase. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the factor's storage is allocated by the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation of MatCUSPARSESetFormat(): records the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* sequential matrices hold a single format field, so MULT and ALL set the same thing */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on A (no-op if absent) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_mult_storage_format (non-factored matrices only) and
   -mat_cusparse_storage_format, forwarding any chosen value to MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: defer to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric factorization routine on the factor matrix. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU SeqAIJ symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU SeqAIJ symbolic phase, then install the CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU SeqAIJ symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower triangle L of the host factor stored in A into
   pinned host buffers, upload it to the GPU as CSR (thrust arrays), and run the
   cuSPARSE triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
         out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast upload) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: each row gets its strictly-lower entries
         from the host factor plus an explicit unit diagonal entry appended at the end */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;       /* unit diagonal entry */
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular with unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device thrust arrays */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
      */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed once the thrust arrays own copies */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper triangle U (non-unit diagonal, stored as 1/diag with scaled
   off-diagonals) of the host factor, upload it as CSR, and run the cuSPARSE
   triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
      */
      nzUpper = adiag[0]-adiag[n];   /* adiag[] is traversed back-to-front in SeqAIJ factors */

      /* Allocate Space for the upper triangular matrix (pinned host memory for fast upload) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking the rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (reciprocal of the stored pivot) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular with non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device thrust arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the pinned staging buffers are no longer needed once the thrust arrays own copies */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the solve work vector,
   and upload the row/column permutation indices when the orderings are not identities. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices: upload the row permutation only when it is non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices: upload the column permutation only when it is non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only what was actually transferred above */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU from the host Cholesky factor.
   The upper factor U is stored directly; the "lower" factor is represented as
   U^T (same sparsity pattern, separate scaled values) solved with a transpose op.
   NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a) here — confirm the factor's data layout supports both views. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED ||
      A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory; AALo holds
         the values for the transposed/"lower" factor built from the same pattern) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negated for U; additionally scaled by the pivot for the U^T values */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device thrust arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
      */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor reuses the UPPER fill mode
         and is solved with a TRANSPOSE operation, i.e. it is stored as U^T */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* the pinned staging buffers are no longer needed once the thrust arrays own copies */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC triangular factors on the GPU, allocate the solve work vector,
   and upload the (single, symmetric) permutation when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* only the upper triangle is stored: count off-diagonals twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices: the same permutation serves both row and column sides */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU SeqAIJ numeric factorization, pick the MatSolve
   variant based on whether the orderings are identities, then move the
   triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: the solves can skip the permutation steps entirely */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: run the CPU SeqAIJ numeric factorization, pick the MatSolve
   variant based on whether the permutation is the identity, then move the
   triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
  */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* natural ordering: the solves can skip the permutation steps entirely */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors and run the cuSPARSE
   solve analysis on them, for use by the transpose solves.
   NOTE(review): this function continues beyond the end of the visible source. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor: the transpose
     flips the fill mode, while type, index base, and diag type carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
733 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 734 diagType = cusparseGetMatDiagType(loTriFactor->descr); 735 736 /* Create the matrix description */ 737 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 738 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 739 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 740 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 741 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 742 743 /* Create the solve analysis information */ 744 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 745 746 /* set the operation */ 747 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 748 749 /* allocate GPU space for the CSC of the lower triangular factor*/ 750 loTriFactorT->csrMat = new CsrMatrix; 751 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 752 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 753 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 754 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 755 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 756 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 757 758 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 759 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 760 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 761 loTriFactor->csrMat->values->data().get(), 762 loTriFactor->csrMat->row_offsets->data().get(), 763 loTriFactor->csrMat->column_indices->data().get(), 764 loTriFactorT->csrMat->values->data().get(), 765 loTriFactorT->csrMat->column_indices->data().get(), 766 loTriFactorT->csrMat->row_offsets->data().get(), 767 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 768 769 /* perform the solve analysis on the transposed matrix */ 770 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 771 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 772 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 773 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 774 loTriFactorT->solveInfo);CHKERRCUDA(stat); 775 776 /* assign the pointer. Is this really necessary? */ 777 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 778 779 /*********************************************/ 780 /* Now the Transpose of the Upper Tri Factor */ 781 /*********************************************/ 782 783 /* allocate space for the transpose of the upper triangular factor */ 784 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 785 786 /* set the matrix descriptors of the upper triangular factor */ 787 matrixType = cusparseGetMatType(upTriFactor->descr); 788 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 789 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
790 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 791 diagType = cusparseGetMatDiagType(upTriFactor->descr); 792 793 /* Create the matrix description */ 794 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 795 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 796 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 797 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 798 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 799 800 /* Create the solve analysis information */ 801 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 802 803 /* set the operation */ 804 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 805 806 /* allocate GPU space for the CSC of the upper triangular factor*/ 807 upTriFactorT->csrMat = new CsrMatrix; 808 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 809 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 810 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 811 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 812 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 813 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 814 815 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 816 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 817 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 818 upTriFactor->csrMat->values->data().get(), 819 upTriFactor->csrMat->row_offsets->data().get(), 820 upTriFactor->csrMat->column_indices->data().get(), 821 upTriFactorT->csrMat->values->data().get(), 822 upTriFactorT->csrMat->column_indices->data().get(), 823 upTriFactorT->csrMat->row_offsets->data().get(), 824 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 825 826 /* perform the solve analysis on the transposed matrix */ 827 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 828 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 829 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 830 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 831 upTriFactorT->solveInfo);CHKERRCUDA(stat); 832 833 /* assign the pointer. Is this really necessary? 
*/ 834 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 835 PetscFunctionReturn(0); 836 } 837 838 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 839 { 840 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 841 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 842 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 843 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 844 cusparseStatus_t stat; 845 cusparseIndexBase_t indexBase; 846 cudaError_t err; 847 PetscErrorCode ierr; 848 849 PetscFunctionBegin; 850 851 /* allocate space for the triangular factor information */ 852 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 853 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 854 indexBase = cusparseGetMatIndexBase(matstruct->descr); 855 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 856 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 857 858 /* set alpha and beta */ 859 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 860 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 861 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 862 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 863 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 864 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 865 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 866 867 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 868 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 869 
CsrMatrix *matrixT= new CsrMatrix; 870 matrixT->num_rows = A->cmap->n; 871 matrixT->num_cols = A->rmap->n; 872 matrixT->num_entries = a->nz; 873 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 874 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 875 matrixT->values = new THRUSTARRAY(a->nz); 876 877 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 878 indexBase = cusparseGetMatIndexBase(matstruct->descr); 879 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 880 matrix->num_cols, matrix->num_entries, 881 matrix->values->data().get(), 882 matrix->row_offsets->data().get(), 883 matrix->column_indices->data().get(), 884 matrixT->values->data().get(), 885 matrixT->column_indices->data().get(), 886 matrixT->row_offsets->data().get(), 887 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 888 889 /* assign the pointer */ 890 matstructT->mat = matrixT; 891 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 892 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 893 /* First convert HYB to CSR */ 894 CsrMatrix *temp= new CsrMatrix; 895 temp->num_rows = A->rmap->n; 896 temp->num_cols = A->cmap->n; 897 temp->num_entries = a->nz; 898 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 899 temp->column_indices = new THRUSTINTARRAY32(a->nz); 900 temp->values = new THRUSTARRAY(a->nz); 901 902 903 stat = cusparse_hyb2csr(cusparsestruct->handle, 904 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 905 temp->values->data().get(), 906 temp->row_offsets->data().get(), 907 temp->column_indices->data().get());CHKERRCUDA(stat); 908 909 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 910 CsrMatrix *tempT= new CsrMatrix; 911 tempT->num_rows = A->rmap->n; 912 tempT->num_cols = A->cmap->n; 913 tempT->num_entries = a->nz; 914 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 915 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 916 tempT->values = new THRUSTARRAY(a->nz); 917 918 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 919 temp->num_cols, temp->num_entries, 920 temp->values->data().get(), 921 temp->row_offsets->data().get(), 922 temp->column_indices->data().get(), 923 tempT->values->data().get(), 924 tempT->column_indices->data().get(), 925 tempT->row_offsets->data().get(), 926 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 927 928 /* Last, convert CSC to HYB */ 929 cusparseHybMat_t hybMat; 930 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 931 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 932 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 933 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 934 matstructT->descr, tempT->values->data().get(), 935 tempT->row_offsets->data().get(), 936 tempT->column_indices->data().get(), 937 hybMat, 0, partition);CHKERRCUDA(stat); 938 939 /* assign the pointer */ 940 matstructT->mat = hybMat; 941 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 942 943 /* delete temporaries */ 944 if (tempT) { 945 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 946 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 947 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 948 delete (CsrMatrix*) tempT; 949 } 950 if (temp) { 951 if (temp->values) delete (THRUSTARRAY*) temp->values; 952 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 953 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 954 delete (CsrMatrix*) temp; 955 
} 956 } 957 /* assign the compressed row indices */ 958 matstructT->cprowIndices = new THRUSTINTARRAY; 959 matstructT->cprowIndices->resize(A->cmap->n); 960 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 961 /* assign the pointer */ 962 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 963 PetscFunctionReturn(0); 964 } 965 966 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 967 { 968 PetscInt n = xx->map->n; 969 const PetscScalar *barray; 970 PetscScalar *xarray; 971 thrust::device_ptr<const PetscScalar> bGPU; 972 thrust::device_ptr<PetscScalar> xGPU; 973 cusparseStatus_t stat; 974 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 975 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 976 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 977 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 978 PetscErrorCode ierr; 979 980 PetscFunctionBegin; 981 /* Analyze the matrix and create the transpose ... 
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for the transpose system the factor order is reversed */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solves A^T x = b with the cached transposed factors for a factorization done in
   natural ordering, so no row/column permutation steps are needed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (factor order is reversed for the transpose system) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solves A x = b using the lower and upper triangular factors on the GPU, applying
   the row permutation to b first and the column permutation to x afterwards. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solves A x = b for a factorization computed in natural ordering (identity
   permutations), skipping the reorder/permute steps. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                        tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copies the host AIJ matrix to the GPU in the chosen storage format (CSR, ELL,
   or HYB).  If only the values changed (same nonzero state, CSR format), just the
   values array is refreshed; otherwise the whole GPU structure is rebuilt with a
   compressed-row layout (rows with no nonzeros are dropped). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only: the sparsity pattern on the GPU is still valid */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows that have at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars for the SpMV calls */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a CSR staging copy first, then convert it to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging copy is no longer needed once the HYB copy exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only when compressed row was forced */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor: adds the first tuple element into the second (y += x); used to
   scatter-add compressed-row SpMV results back into the full vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* yy = A*xx, implemented as MatMultAdd with no addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx, performed as a non-transpose SpMV on the cached explicit transpose
   (built on first use by MatSeqAIJCUSPARSEGenerateTransposeForMult). */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar *xarray;
  PetscScalar *yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user-provided stream is in use, let the caller manage synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* Computes zz = yy + A*xx (or zz = A*xx when yy is NULL).  When the matrix is
   stored with compressed rows, the SpMV result lands in the work vector and is
   scattered-added into zz through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar *xarray;
  PetscScalar *zarray,*dptr,*beta;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write straight into zz only when no rows were compressed away */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
         which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* initialize zz: copy yy in, add yy, or zero, depending on aliasing and
       whether the SpMV wrote to the work vector or directly into zz */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Computes zz = yy + A^T*xx using the cached explicit transpose; the SpMV result
   goes through the work vector and is scattered-added into zz. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar *xarray;
  PetscScalar *zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta_zero,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Completes host-side assembly via the SeqAIJ implementation; the remainder of
   this routine continues beyond this chunk. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr =
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1548 } 1549 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1550 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1551 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1552 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1553 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1554 PetscFunctionReturn(0); 1555 } 1556 1557 /* --------------------------------------------------------------------------------*/ 1558 /*@ 1559 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1560 (the default parallel PETSc format). This matrix will ultimately pushed down 1561 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1562 assembly performance the user should preallocate the matrix storage by setting 1563 the parameter nz (or the array nnz). By setting these parameters accurately, 1564 performance during matrix assembly can be increased by more than a factor of 50. 1565 1566 Collective 1567 1568 Input Parameters: 1569 + comm - MPI communicator, set to PETSC_COMM_SELF 1570 . m - number of rows 1571 . n - number of columns 1572 . nz - number of nonzeros per row (same for all rows) 1573 - nnz - array containing the number of nonzeros in the various rows 1574 (possibly different for each row) or NULL 1575 1576 Output Parameter: 1577 . A - the matrix 1578 1579 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1580 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1581 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1582 1583 Notes: 1584 If nnz is given then nz is ignored 1585 1586 The AIJ format (also called the Yale sparse matrix format or 1587 compressed row storage), is fully compatible with standard Fortran 77 1588 storage. That is, the stored row and column indices can begin at 1589 either one (as in Fortran) or zero. See the users' manual for details. 

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatDestroy_SeqAIJCUSPARSE - frees the GPU-side structures (either the mult
   structures or the triangular factors, depending on factortype), then destroys
   the host AIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing was ever pushed to the GPU if the matrix is still PETSC_OFFLOAD_UNALLOCATED */
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates the host AIJ data, then attaches a
   fresh (empty) GPU structure to the copy; GPU data is regenerated on first use
   since the copy is marked PETSC_OFFLOAD_UNALLOCATED.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    /* NOTE(review): the duplicate always starts in CSR format rather than copying
       A's storage format -- confirm this is intended */
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SEQAIJ matrix in place to
   SEQAIJCUSPARSE: switches the default vector type to CUDA, attaches the GPU
   data structure, and installs the GPU operations.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
1704 now build a GPU matrix data structure */ 1705 B->spptr = new Mat_SeqAIJCUSPARSE; 1706 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1707 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1708 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1709 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1710 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1711 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1712 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1713 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1714 } else { 1715 /* NEXT, set the pointers to the triangular factors */ 1716 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1717 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1718 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1719 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1720 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1724 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1725 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1727 } 1728 1729 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1730 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1731 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1732 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1733 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1734 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1735 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1736 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1737 1738 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1739 1740 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1741 1742 ierr = PetscObjectComposeFunction((PetscObject)B, 
"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1747 { 1748 PetscErrorCode ierr; 1749 1750 PetscFunctionBegin; 1751 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1752 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B); 1753 PetscFunctionReturn(0); 1754 } 1755 1756 /*MC 1757 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1758 1759 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1760 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1761 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1762 1763 Options Database Keys: 1764 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1765 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1766 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1767 1768 Level: beginner 1769 1770 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1771 M*/ 1772 1773 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1774 1775 1776 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1777 { 1778 PetscErrorCode ierr; 1779 1780 PetscFunctionBegin; 1781 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1782 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1783 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1784 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1785 PetscFunctionReturn(0); 1786 } 1787 1788 1789 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1790 { 1791 cusparseStatus_t stat; 1792 cusparseHandle_t handle; 1793 1794 PetscFunctionBegin; 1795 if (*cusparsestruct) { 1796 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1797 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1798 delete (*cusparsestruct)->workVector; 1799 if (handle = (*cusparsestruct)->handle) { 1800 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1801 } 1802 delete *cusparsestruct; 1803 *cusparsestruct = 0; 1804 } 1805 PetscFunctionReturn(0); 1806 } 1807 1808 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1809 { 1810 PetscFunctionBegin; 1811 if (*mat) { 1812 delete (*mat)->values; 1813 delete (*mat)->column_indices; 1814 delete (*mat)->row_offsets; 1815 delete *mat; 1816 *mat = 0; 1817 } 1818 PetscFunctionReturn(0); 1819 } 1820 1821 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees the
   cusparse descriptor, the solve-analysis info, and the CSR storage of one
   triangular factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult-structure overload) - frees the GPU
   matrix (HYB or CSR depending on the storage format), the descriptor, the
   compressed-row index vector, and the device-resident alpha/beta scalars.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* fix: return code was previously ignored */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta scalars live in device memory (cusparse pointer mode DEVICE) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSETriFactors_Destroy - frees all four triangular factors, the
   permutation index vectors, the work vector, and the cusparse handle.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* fix: these destroy routines return PetscErrorCode; the results were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* assignment intended; parenthesized for clarity */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}