/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option strings for the GPU storage format enum; trailing sentinel entries follow the
   PetscOptionsEnum convention (enum name, prefix, NULL terminator). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization kernels implemented later in this file. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves and matrix-vector products on the GPU. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures. NOTE: the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (this file is compiled as CUDA C++), distinguished by their argument types. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Set the CUDA stream used by the cusparse handle attached to A.
   Assumes A->spptr is a valid Mat_SeqAIJCUSPARSE (not checked). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  /* NOTE(review): cusparseStatus_t is checked with CHKERRCUDA here and throughout
     this file; the status values are not cudaError_t — confirm the macro handles this. */
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the cusparse handle attached to A with a caller-supplied one.
   Any previously owned handle is destroyed first; the pointer mode is
   then set to DEVICE on the (possibly new) handle. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently own before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop the reference to the cusparse handle WITHOUT destroying it —
   presumably the handle is owned by the caller after MatCUSPARSESetHandle;
   verify against callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create an (empty) factor matrix B of type MATSEQAIJCUSPARSE for A and install
   the symbolic-factorization function pointers appropriate for ftype.

   Supported factor types: LU/ILU/ILUDT and Cholesky/ICC; anything else errors
   with PETSC_ERR_SUP. The numeric-factorization pointers are installed later,
   by the symbolic routines. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* propagate block sizes for every supported factor type (previously this was
     done only in the LU/ILU branch, leaving Cholesky/ICC factors without them) */
  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor's storage is allocated by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* For sequential matrices there is a single stored format, so MULT and ALL
     both simply overwrite it; only the error path distinguishes operations. */
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: a no-op if A's type did not compose MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix.
   The mult-only format option is offered only for non-factored matrices;
   the combined (SpMV + TriSolve) option is always offered. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: delegate to the CPU AIJ implementation, then point the
   numeric phase at the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit-diagonal lower triangular factor L from the combined LU
   storage of a factored SeqAIJ matrix, upload it to the GPU as a CSR matrix,
   and run the cusparse triangular-solve analysis on it. Host staging buffers
   are pinned (cudaMallocHost) for fast transfer. Only runs when the GPU copy
   is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[1] is read here, so this presumably assumes n >= 1 — confirm callers never pass an empty matrix */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal;
         each later row copies its strictly-lower entries and appends a 1.0 diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR populated from the pinned host buffers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
      */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers; the data now lives on the device */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper triangular factor U (non-unit diagonal, stored with its
   reciprocal diagonal) from the combined LU storage, upload it to the GPU,
   and run the cusparse triangular-solve analysis. Mirrors the lower-tri
   builder above but walks the rows backwards using a->diag. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* adiag holds U's rows in reverse (adiag[0] > adiag[n]), hence the subtraction order */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the stored diagonal is its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU triangular factors on the GPU, allocate the solve work
   vector, and upload the row/column permutations when they are not the
   identity. Marks the GPU copy as current (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices: only uploaded when the row permutation is non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices: only uploaded when the column permutation is non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC (Cholesky) triangular factors on the GPU from the upper
   triangular storage of the factored matrix. The lower factor is represented
   by the SAME sparsity as the upper factor, solved with the TRANSPOSE
   operation (see the loTriFactor setup below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ and Mat_SeqSBAIJ here;
     presumably the leading i/j/a members are layout-compatible — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED ||
A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; AALo receives the scaled values used
         for the (transposed) lower-factor solve */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill with UNIT diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
      */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. Deliberately UPPER fill: the lower factor
         is stored as the upper-triangular pattern and solved with the TRANSPOSE
         operation (see solveOp below), avoiding a separate lower CSR. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU, allocate the work vector, and upload the
   (single) permutation used for both row and column reordering when it is not
   the identity. nnz counts both triangles, sharing the diagonal. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices: the same permutation serves both factors */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU numeric factorization, then select the GPU solve
   variants (natural ordering fast path when both permutations are identity)
   and upload the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation scatter/gather on the GPU */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: CPU numeric factorization, then GPU solve selection and
   upload of the ICC triangular factors (mirrors the LU version above). */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build explicit transposes (CSC) of both triangular factors for use by
   MatSolveTranspose. (Definition continues beyond this review chunk.) */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of lower is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
733 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 734 diagType = cusparseGetMatDiagType(loTriFactor->descr); 735 736 /* Create the matrix description */ 737 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 738 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 739 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 740 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 741 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 742 743 /* Create the solve analysis information */ 744 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 745 746 /* set the operation */ 747 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 748 749 /* allocate GPU space for the CSC of the lower triangular factor*/ 750 loTriFactorT->csrMat = new CsrMatrix; 751 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 752 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 753 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 754 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 755 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 756 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 757 758 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 759 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 760 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 761 loTriFactor->csrMat->values->data().get(), 762 loTriFactor->csrMat->row_offsets->data().get(), 763 loTriFactor->csrMat->column_indices->data().get(), 764 loTriFactorT->csrMat->values->data().get(), 765 loTriFactorT->csrMat->column_indices->data().get(), 766 loTriFactorT->csrMat->row_offsets->data().get(), 767 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 768 769 /* perform the solve analysis on the transposed matrix */ 770 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 771 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 772 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 773 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 774 loTriFactorT->solveInfo);CHKERRCUDA(stat); 775 776 /* assign the pointer. Is this really necessary? */ 777 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 778 779 /*********************************************/ 780 /* Now the Transpose of the Upper Tri Factor */ 781 /*********************************************/ 782 783 /* allocate space for the transpose of the upper triangular factor */ 784 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 785 786 /* set the matrix descriptors of the upper triangular factor */ 787 matrixType = cusparseGetMatType(upTriFactor->descr); 788 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 789 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
790 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 791 diagType = cusparseGetMatDiagType(upTriFactor->descr); 792 793 /* Create the matrix description */ 794 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 795 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 796 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 797 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 798 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 799 800 /* Create the solve analysis information */ 801 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 802 803 /* set the operation */ 804 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 805 806 /* allocate GPU space for the CSC of the upper triangular factor*/ 807 upTriFactorT->csrMat = new CsrMatrix; 808 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 809 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 810 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 811 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 812 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 813 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 814 815 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 816 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 817 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 818 upTriFactor->csrMat->values->data().get(), 819 upTriFactor->csrMat->row_offsets->data().get(), 820 upTriFactor->csrMat->column_indices->data().get(), 821 upTriFactorT->csrMat->values->data().get(), 822 upTriFactorT->csrMat->column_indices->data().get(), 823 upTriFactorT->csrMat->row_offsets->data().get(), 824 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 825 826 /* perform the solve analysis on the transposed matrix */ 827 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 828 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 829 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 830 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 831 upTriFactorT->solveInfo);CHKERRCUDA(stat); 832 833 /* assign the pointer. Is this really necessary? 
*/
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   Builds the GPU-resident transpose of A (in the currently selected storage
   format, CSR or ELL/HYB) for use by MatMultTranspose/MatMultTransposeAdd,
   and stashes it in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is a dead store -- matstructT is reassigned
     with `new` below; callers in this file only invoke this routine when
     cusparsestruct->matTranspose is NULL, otherwise this would leak */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cudaError_t err;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose multiply structure */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars, matching the
     CUSPARSE_POINTER_MODE_DEVICE set below */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose dimensions: rows and columns swapped relative to A */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* NOTE(review): row_offsets is sized with A->rmap->n+1 although the transpose
       has A->cmap->n rows; these agree only for square A -- verify for rectangular */
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT's dimensions (and the csr2hyb call below) use A's
       row/column counts unswapped; correct only for square A -- verify for
       rectangular matrices */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so this is just the identity map 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b using the cached transposed factors (built on first use),
   applying the row/column permutations from the factorization ordering.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for A = LU, A^T = U^T L^T, so the upper factor's transpose
     is applied first; the transpose is stored explicitly, hence NON_TRANSPOSE op) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b for factors stored in natural ordering: no row/column
   permutations are needed, so the triangular solves read/write the vector
   arrays directly.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (upper factor's transpose applied first for A^T = U^T L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b with the cached lower/upper factors, applying the ordering
   permutations before the lower solve and after the upper solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation (the range is bounded by the index
     iterators, so the same base pointer bGPU is used on both ends) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b for factors stored in natural ordering: no permutations,
   just L then U triangular solves through the work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Mirrors the host CSR matrix onto the GPU in the selected storage format
   (CSR, ELL or HYB). If the nonzero pattern is unchanged (same nonzerostate)
   and the format is CSR, only the values array is re-uploaded; otherwise the
   GPU structures are rebuilt from scratch using compressed-row storage (rows
   with no nonzeros are squeezed out, and cprowIndices records the mapping
   back to full rows).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern changed (or non-CSR format): rebuild everything */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m = a->compressedrow.nrows;
          ii = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build ii (row offsets over
             nonempty rows only) and ridx (original row index of each kept row) */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k] = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix-multiply structure */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants for the SpMV calls
           (pointer mode DEVICE set below) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage a CSR copy on the GPU, convert to HYB, then discard the staging copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated here only in the forced-compression branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust::for_each over zip iterators: given a tuple (src, dst),
   performs dst += src; used to scatter-add compressed-row SpMV results into
   the full output vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A x, implemented as MatMultAdd with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x using the cached GPU transpose of A (built on first use) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar *xarray;
  PetscScalar *yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* the transpose is stored explicitly, so the SpMV op is NON_TRANSPOSE */
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with a user-supplied stream the caller is responsible for synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
   z = y + A x (or z = A x when yy is NULL). With compressed-row storage the
   SpMV result lands in a work vector sized by the number of nonempty rows and
   is then scatter-added into z via cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar *xarray;
  PetscScalar *zarray,*dptr,*beta;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  PetscBool cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when no rows were squeezed out (workVector covers all rows) */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* write directly into z when nothing was compressed away, else into the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when computing z = z + A x in place */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* when the SpMV went through the work vector, first seed z with yy (or zero);
       when it wrote z directly, add yy unless it was already accumulated in place */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = y + A^T x using the cached GPU transpose of A (built on first use) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar *xarray;
  PetscScalar *zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* seed z with y, then accumulate the SpMV result into it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1516 size of the workVector */ 1517 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1518 mat->num_rows, mat->num_cols, 1519 mat->num_entries, matstructT->alpha, matstructT->descr, 1520 mat->values->data().get(), mat->row_offsets->data().get(), 1521 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1522 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1523 } else { 1524 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1525 if (cusparsestruct->workVector->size()) { 1526 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1527 matstructT->alpha, matstructT->descr, hybMat, 1528 xarray, matstructT->beta_zero, 1529 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1530 } 1531 } 1532 1533 /* scatter the data from the temporary into the full vector with a += operation */ 1534 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1535 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1536 VecCUDAPlusEquals()); 1537 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1538 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1539 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1540 1541 } catch(char *ex) { 1542 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1543 } 1544 ierr = WaitForGPU();CHKERRCUDA(ierr); 1545 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1546 PetscFunctionReturn(0); 1547 } 1548 1549 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1550 { 1551 PetscErrorCode ierr; 1552 1553 PetscFunctionBegin; 1554 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1555 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1556 if 
(A->factortype == MAT_FACTOR_NONE) { 1557 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1558 } 1559 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1560 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1561 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1562 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1563 PetscFunctionReturn(0); 1564 } 1565 1566 /* --------------------------------------------------------------------------------*/ 1567 /*@ 1568 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1569 (the default parallel PETSc format). This matrix will ultimately pushed down 1570 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1571 assembly performance the user should preallocate the matrix storage by setting 1572 the parameter nz (or the array nnz). By setting these parameters accurately, 1573 performance during matrix assembly can be increased by more than a factor of 50. 1574 1575 Collective 1576 1577 Input Parameters: 1578 + comm - MPI communicator, set to PETSC_COMM_SELF 1579 . m - number of rows 1580 . n - number of columns 1581 . nz - number of nonzeros per row (same for all rows) 1582 - nnz - array containing the number of nonzeros in the various rows 1583 (possibly different for each row) or NULL 1584 1585 Output Parameter: 1586 . A - the matrix 1587 1588 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1589 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1590 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1591 1592 Notes: 1593 If nnz is given then nz is ignored 1594 1595 The AIJ format (also called the Yale sparse matrix format or 1596 compressed row storage), is fully compatible with standard Fortran 77 1597 storage. That is, the stored row and column indices can begin at 1598 either one (as in Fortran) or zero. See the users' manual for details. 
1599 1600 Specify the preallocated storage with either nz or nnz (not both). 1601 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1602 allocation. For large problems you MUST preallocate memory or you 1603 will get TERRIBLE performance, see the users' manual chapter on matrices. 1604 1605 By default, this format uses inodes (identical nodes) when possible, to 1606 improve numerical efficiency of matrix-vector products and solves. We 1607 search for consecutive rows with the same nonzero structure, thereby 1608 reusing matrix information to achieve increased efficiency. 1609 1610 Level: intermediate 1611 1612 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1613 @*/ 1614 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1615 { 1616 PetscErrorCode ierr; 1617 1618 PetscFunctionBegin; 1619 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1620 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1621 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1622 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1627 { 1628 PetscErrorCode ierr; 1629 1630 PetscFunctionBegin; 1631 if (A->factortype==MAT_FACTOR_NONE) { 1632 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1633 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1634 } 1635 } else { 1636 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1637 } 1638 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1639 PetscFunctionReturn(0); 1640 } 1641 1642 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1643 { 1644 PetscErrorCode ierr; 1645 Mat C; 1646 cusparseStatus_t stat; 1647 
cusparseHandle_t handle=0; 1648 1649 PetscFunctionBegin; 1650 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1651 C = *B; 1652 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1653 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1654 1655 /* inject CUSPARSE-specific stuff */ 1656 if (C->factortype==MAT_FACTOR_NONE) { 1657 /* you cannot check the inode.use flag here since the matrix was just created. 1658 now build a GPU matrix data structure */ 1659 C->spptr = new Mat_SeqAIJCUSPARSE; 1660 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1661 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1662 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1663 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1664 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1665 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1666 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1667 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1668 } else { 1669 /* NEXT, set the pointers to the triangular factors */ 1670 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1671 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1672 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1673 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1674 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1675 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1676 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1677 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1678 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1679 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1680 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1681 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1682 } 1683 1684 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1685 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1686 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1687 
C->ops->mult = MatMult_SeqAIJCUSPARSE; 1688 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1689 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1690 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1691 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1692 1693 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1694 1695 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1696 1697 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1698 PetscFunctionReturn(0); 1699 } 1700 1701 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1702 { 1703 PetscErrorCode ierr; 1704 cusparseStatus_t stat; 1705 cusparseHandle_t handle=0; 1706 1707 PetscFunctionBegin; 1708 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1709 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1710 1711 if (B->factortype==MAT_FACTOR_NONE) { 1712 /* you cannot check the inode.use flag here since the matrix was just created. 
1713 now build a GPU matrix data structure */ 1714 B->spptr = new Mat_SeqAIJCUSPARSE; 1715 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1716 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1717 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1718 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1719 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1720 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1721 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1722 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1723 } else { 1724 /* NEXT, set the pointers to the triangular factors */ 1725 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1727 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1728 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1729 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1731 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1732 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1733 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1734 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1736 } 1737 1738 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1739 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1740 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1741 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1742 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1743 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1744 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1745 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1746 1747 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1748 1749 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1750 1751 ierr = PetscObjectComposeFunction((PetscObject)B, 
"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1752 PetscFunctionReturn(0); 1753 } 1754 1755 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1756 { 1757 PetscErrorCode ierr; 1758 1759 PetscFunctionBegin; 1760 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1761 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 /*MC 1766 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1767 1768 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1769 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1770 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1771 1772 Options Database Keys: 1773 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1774 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1775 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1776 1777 Level: beginner 1778 1779 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1780 M*/ 1781 1782 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1783 1784 1785 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1786 { 1787 PetscErrorCode ierr; 1788 1789 PetscFunctionBegin; 1790 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1791 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1792 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1793 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1794 PetscFunctionReturn(0); 1795 } 1796 1797 1798 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1799 { 1800 cusparseStatus_t stat; 1801 cusparseHandle_t handle; 1802 1803 PetscFunctionBegin; 1804 if (*cusparsestruct) { 1805 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1806 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1807 delete (*cusparsestruct)->workVector; 1808 if (handle = (*cusparsestruct)->handle) { 1809 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1810 } 1811 delete *cusparsestruct; 1812 *cusparsestruct = 0; 1813 } 1814 PetscFunctionReturn(0); 1815 } 1816 1817 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1818 { 1819 PetscFunctionBegin; 1820 if (*mat) { 1821 delete (*mat)->values; 1822 delete (*mat)->column_indices; 1823 delete (*mat)->row_offsets; 1824 delete *mat; 1825 *mat = 0; 1826 } 1827 PetscFunctionReturn(0); 1828 } 1829 1830 
/* Frees one triangular-factor structure: matrix descriptor, solve-analysis
   info, and the CSR storage.  Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees one mult structure: the stored matrix (HYB or CSR depending on the
   storage format), descriptor, compressed-row index vector, and the three
   device-resident scalar constants.  Safe to call with *matstruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* the return code was previously ignored; check it */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one live in device memory (used with CUSPARSE_POINTER_MODE_DEVICE) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees the complete set of triangular factors (lower/upper, plain and
   transpose), the permutation index vectors, the work vector, and the
   cusparse handle.  Safe to call with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* error codes were previously dropped here; check them */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}