/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* String names for the MatCUSPARSEStorageFormat enum, used by the options database */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and mult kernels implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record a user-provided CUDA stream on the matrix and make its cusparse handle use it */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the matrix' cusparse handle with a user-provided one; any previous
   (different) handle is destroyed first.  The pointer mode is (re)set to device. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop the matrix' reference to its cusparse handle; the handle itself is NOT
   destroyed here (presumably it is owned elsewhere when this is used) */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name (MATSOLVERCUSPARSE) for factored matrices of this type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create a factor matrix B (of type seqaijcusparse) suitable for factoring A.
   Only the symbolic-factorization kernels are wired up here; storage is
   allocated lazily (MAT_SKIP_ALLOCATION). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat() */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.
  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

  Output Parameter:

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the options database entries that control the GPU storage format */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option is offered only for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU is done on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU is done on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC is done on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky is done on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Extract the unit lower triangular factor L from the combined LU storage of a
   factored SeqAIJ matrix, copy it to the GPU, and run the cusparse solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the subsequent host-to-device copies are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular with an (implicit) unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp =
CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host-assembled CSR arrays into device (thrust) storage */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers now that the data lives on the GPU */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Extract the upper triangular factor U from the combined LU storage of a
   factored SeqAIJ matrix, copy it to the GPU, and run the cusparse solve analysis */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the subsequent host-to-device copies are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking the rows backwards since
         adiag[] indexes the U part of the SeqAIJ factor storage in reverse */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted in the factor) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers now that the data lives on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push both ILU triangular factors to the GPU and cache the row/column
   permutation indices (when not the identity) needed by MatSolve */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log how many permutation indices were uploaded to the GPU */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC/Cholesky triangular factors on the GPU from the SBAIJ-style
   upper triangular storage of the factored matrix; the "lower" solve later
   reuses the upper factor with a transposed operation */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if
(A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo holds the scaled values reused for the lower solve */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (the factor stores them inverted: v[nz] = 1/d_i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonals and additionally scale the lower-solve copy */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular with unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat =
cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host-assembled CSR arrays into device (thrust) storage */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note the UPPER fill mode — the lower
         solve is performed as a TRANSPOSE solve with the (scaled) upper factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, scaled values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers now that the data lives on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push the ICC factors to the GPU and cache the (symmetric) permutation
   indices when the ordering is not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* the same permutation is used for both rows and columns */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU on the CPU, then select the appropriate GPU MatSolve variant
   and upload the triangular factors */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations: the cheaper natural-ordering solve applies */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky on the CPU, then select the appropriate GPU MatSolve
   variant and upload the triangular factors */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* no permutation: the cheaper natural-ordering solve applies */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors, with their own
   solve-analysis data, so MatSolveTranspose can run efficient solves */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips since the data will be stored transposed */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
736 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 737 diagType = cusparseGetMatDiagType(loTriFactor->descr); 738 739 /* Create the matrix description */ 740 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 741 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 742 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 743 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 744 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 745 746 /* Create the solve analysis information */ 747 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 748 749 /* set the operation */ 750 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 751 752 /* allocate GPU space for the CSC of the lower triangular factor*/ 753 loTriFactorT->csrMat = new CsrMatrix; 754 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 755 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 756 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 757 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 758 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 759 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 760 761 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 762 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 763 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 764 loTriFactor->csrMat->values->data().get(), 765 loTriFactor->csrMat->row_offsets->data().get(), 766 loTriFactor->csrMat->column_indices->data().get(), 767 loTriFactorT->csrMat->values->data().get(), 768 loTriFactorT->csrMat->column_indices->data().get(), 769 loTriFactorT->csrMat->row_offsets->data().get(), 770 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 771 772 /* perform the solve analysis on the transposed matrix */ 773 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 774 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 775 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 776 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 777 loTriFactorT->solveInfo);CHKERRCUDA(stat); 778 779 /* assign the pointer. Is this really necessary? */ 780 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 781 782 /*********************************************/ 783 /* Now the Transpose of the Upper Tri Factor */ 784 /*********************************************/ 785 786 /* allocate space for the transpose of the upper triangular factor */ 787 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 788 789 /* set the matrix descriptors of the upper triangular factor */ 790 matrixType = cusparseGetMatType(upTriFactor->descr); 791 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 792 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
793 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 794 diagType = cusparseGetMatDiagType(upTriFactor->descr); 795 796 /* Create the matrix description */ 797 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 798 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 799 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 800 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 801 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 802 803 /* Create the solve analysis information */ 804 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 805 806 /* set the operation */ 807 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 808 809 /* allocate GPU space for the CSC of the upper triangular factor*/ 810 upTriFactorT->csrMat = new CsrMatrix; 811 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 812 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 813 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 814 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 815 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 816 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 817 818 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 819 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 820 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 821 upTriFactor->csrMat->values->data().get(), 822 upTriFactor->csrMat->row_offsets->data().get(), 823 upTriFactor->csrMat->column_indices->data().get(), 824 upTriFactorT->csrMat->values->data().get(), 825 upTriFactorT->csrMat->column_indices->data().get(), 826 upTriFactorT->csrMat->row_offsets->data().get(), 827 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 828 829 /* perform the solve analysis on the transposed matrix */ 830 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 831 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 832 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 833 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 834 upTriFactorT->solveInfo);CHKERRCUDA(stat); 835 836 /* assign the pointer. Is this really necessary? 
*/ 837 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 838 PetscFunctionReturn(0); 839 } 840 841 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 842 { 843 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 844 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 845 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 846 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 847 cusparseStatus_t stat; 848 cusparseIndexBase_t indexBase; 849 cudaError_t err; 850 PetscErrorCode ierr; 851 852 PetscFunctionBegin; 853 854 /* allocate space for the triangular factor information */ 855 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 856 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 857 indexBase = cusparseGetMatIndexBase(matstruct->descr); 858 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 859 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 860 861 /* set alpha and beta */ 862 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 863 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 864 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 865 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 866 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 867 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 868 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 869 870 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 871 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 872 
CsrMatrix *matrixT= new CsrMatrix; 873 matrixT->num_rows = A->cmap->n; 874 matrixT->num_cols = A->rmap->n; 875 matrixT->num_entries = a->nz; 876 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 877 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 878 matrixT->values = new THRUSTARRAY(a->nz); 879 880 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 881 indexBase = cusparseGetMatIndexBase(matstruct->descr); 882 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 883 matrix->num_cols, matrix->num_entries, 884 matrix->values->data().get(), 885 matrix->row_offsets->data().get(), 886 matrix->column_indices->data().get(), 887 matrixT->values->data().get(), 888 matrixT->column_indices->data().get(), 889 matrixT->row_offsets->data().get(), 890 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 891 892 /* assign the pointer */ 893 matstructT->mat = matrixT; 894 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 895 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 896 /* First convert HYB to CSR */ 897 CsrMatrix *temp= new CsrMatrix; 898 temp->num_rows = A->rmap->n; 899 temp->num_cols = A->cmap->n; 900 temp->num_entries = a->nz; 901 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 902 temp->column_indices = new THRUSTINTARRAY32(a->nz); 903 temp->values = new THRUSTARRAY(a->nz); 904 905 906 stat = cusparse_hyb2csr(cusparsestruct->handle, 907 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 908 temp->values->data().get(), 909 temp->row_offsets->data().get(), 910 temp->column_indices->data().get());CHKERRCUDA(stat); 911 912 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 913 CsrMatrix *tempT= new CsrMatrix; 914 tempT->num_rows = A->rmap->n; 915 tempT->num_cols = A->cmap->n; 916 tempT->num_entries = a->nz; 917 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 918 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 919 tempT->values = new THRUSTARRAY(a->nz); 920 921 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 922 temp->num_cols, temp->num_entries, 923 temp->values->data().get(), 924 temp->row_offsets->data().get(), 925 temp->column_indices->data().get(), 926 tempT->values->data().get(), 927 tempT->column_indices->data().get(), 928 tempT->row_offsets->data().get(), 929 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 930 931 /* Last, convert CSC to HYB */ 932 cusparseHybMat_t hybMat; 933 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 934 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 935 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 936 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 937 matstructT->descr, tempT->values->data().get(), 938 tempT->row_offsets->data().get(), 939 tempT->column_indices->data().get(), 940 hybMat, 0, partition);CHKERRCUDA(stat); 941 942 /* assign the pointer */ 943 matstructT->mat = hybMat; 944 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 945 946 /* delete temporaries */ 947 if (tempT) { 948 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 949 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 950 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 951 delete (CsrMatrix*) tempT; 952 } 953 if (temp) { 954 if (temp->values) delete (THRUSTARRAY*) temp->values; 955 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 956 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 957 delete (CsrMatrix*) temp; 958 
} 959 } 960 /* assign the compressed row indices */ 961 matstructT->cprowIndices = new THRUSTINTARRAY; 962 matstructT->cprowIndices->resize(A->cmap->n); 963 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 964 /* assign the pointer */ 965 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 966 PetscFunctionReturn(0); 967 } 968 969 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 970 { 971 PetscInt n = xx->map->n; 972 const PetscScalar *barray; 973 PetscScalar *xarray; 974 thrust::device_ptr<const PetscScalar> bGPU; 975 thrust::device_ptr<PetscScalar> xGPU; 976 cusparseStatus_t stat; 977 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 978 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 979 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 980 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 981 PetscErrorCode ierr; 982 983 PetscFunctionBegin; 984 /* Analyze the matrix and create the transpose ... 
on the fly */ 985 if (!loTriFactorT && !upTriFactorT) { 986 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 987 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 988 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 989 } 990 991 /* Get the GPU pointers */ 992 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 993 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 994 xGPU = thrust::device_pointer_cast(xarray); 995 bGPU = thrust::device_pointer_cast(barray); 996 997 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 998 /* First, reorder with the row permutation */ 999 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1000 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1001 xGPU); 1002 1003 /* First, solve U */ 1004 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1005 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1006 upTriFactorT->csrMat->values->data().get(), 1007 upTriFactorT->csrMat->row_offsets->data().get(), 1008 upTriFactorT->csrMat->column_indices->data().get(), 1009 upTriFactorT->solveInfo, 1010 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1011 1012 /* Then, solve L */ 1013 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1014 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1015 loTriFactorT->csrMat->values->data().get(), 1016 loTriFactorT->csrMat->row_offsets->data().get(), 1017 loTriFactorT->csrMat->column_indices->data().get(), 1018 loTriFactorT->solveInfo, 1019 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1020 1021 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1022 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1023 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1024 tempGPU->begin()); 1025 1026 /* Copy the temporary to the full solution. */ 1027 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1028 1029 /* restore */ 1030 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1031 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1032 ierr = WaitForGPU();CHKERRCUDA(ierr); 1033 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1035 PetscFunctionReturn(0); 1036 } 1037 1038 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1039 { 1040 const PetscScalar *barray; 1041 PetscScalar *xarray; 1042 cusparseStatus_t stat; 1043 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1044 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1045 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1046 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1047 PetscErrorCode ierr; 1048 1049 PetscFunctionBegin; 1050 /* Analyze the matrix and create the transpose ... 
on the fly */ 1051 if (!loTriFactorT && !upTriFactorT) { 1052 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1053 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1054 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1055 } 1056 1057 /* Get the GPU pointers */ 1058 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1059 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1060 1061 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1062 /* First, solve U */ 1063 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1064 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1065 upTriFactorT->csrMat->values->data().get(), 1066 upTriFactorT->csrMat->row_offsets->data().get(), 1067 upTriFactorT->csrMat->column_indices->data().get(), 1068 upTriFactorT->solveInfo, 1069 barray, tempGPU->data().get());CHKERRCUDA(stat); 1070 1071 /* Then, solve L */ 1072 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1073 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1074 loTriFactorT->csrMat->values->data().get(), 1075 loTriFactorT->csrMat->row_offsets->data().get(), 1076 loTriFactorT->csrMat->column_indices->data().get(), 1077 loTriFactorT->solveInfo, 1078 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1079 1080 /* restore */ 1081 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1082 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1083 ierr = WaitForGPU();CHKERRCUDA(ierr); 1084 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1085 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1090 { 1091 const PetscScalar *barray; 1092 PetscScalar *xarray; 1093 thrust::device_ptr<const PetscScalar> bGPU; 1094 thrust::device_ptr<PetscScalar> 
xGPU; 1095 cusparseStatus_t stat; 1096 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1097 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1098 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1099 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1100 PetscErrorCode ierr; 1101 1102 PetscFunctionBegin; 1103 1104 /* Get the GPU pointers */ 1105 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1106 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1107 xGPU = thrust::device_pointer_cast(xarray); 1108 bGPU = thrust::device_pointer_cast(barray); 1109 1110 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1111 /* First, reorder with the row permutation */ 1112 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1113 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1114 xGPU); 1115 1116 /* Next, solve L */ 1117 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1118 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1119 loTriFactor->csrMat->values->data().get(), 1120 loTriFactor->csrMat->row_offsets->data().get(), 1121 loTriFactor->csrMat->column_indices->data().get(), 1122 loTriFactor->solveInfo, 1123 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1124 1125 /* Then, solve U */ 1126 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1127 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1128 upTriFactor->csrMat->values->data().get(), 1129 upTriFactor->csrMat->row_offsets->data().get(), 1130 upTriFactor->csrMat->column_indices->data().get(), 1131 upTriFactor->solveInfo, 1132 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1133 1134 /* Last, copy the solution, xGPU, into a temporary with the column 
permutation ... can't be done in place. */ 1135 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1136 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1137 tempGPU->begin()); 1138 1139 /* Copy the temporary to the full solution. */ 1140 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1141 1142 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1143 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1144 ierr = WaitForGPU();CHKERRCUDA(ierr); 1145 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1146 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1147 PetscFunctionReturn(0); 1148 } 1149 1150 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1151 { 1152 const PetscScalar *barray; 1153 PetscScalar *xarray; 1154 cusparseStatus_t stat; 1155 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1156 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1157 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1158 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1159 PetscErrorCode ierr; 1160 1161 PetscFunctionBegin; 1162 /* Get the GPU pointers */ 1163 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1164 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1165 1166 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1167 /* First, solve L */ 1168 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1169 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1170 loTriFactor->csrMat->values->data().get(), 1171 loTriFactor->csrMat->row_offsets->data().get(), 1172 loTriFactor->csrMat->column_indices->data().get(), 1173 loTriFactor->solveInfo, 1174 barray, 
tempGPU->data().get());CHKERRCUDA(stat); 1175 1176 /* Next, solve U */ 1177 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1178 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1179 upTriFactor->csrMat->values->data().get(), 1180 upTriFactor->csrMat->row_offsets->data().get(), 1181 upTriFactor->csrMat->column_indices->data().get(), 1182 upTriFactor->solveInfo, 1183 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1184 1185 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1186 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1187 ierr = WaitForGPU();CHKERRCUDA(ierr); 1188 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1189 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1190 PetscFunctionReturn(0); 1191 } 1192 1193 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1194 { 1195 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1196 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1197 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1198 PetscInt m = A->rmap->n,*ii,*ridx; 1199 PetscErrorCode ierr; 1200 cusparseStatus_t stat; 1201 cudaError_t err; 1202 1203 PetscFunctionBegin; 1204 if (A->pinnedtocpu) PetscFunctionReturn(0); 1205 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1206 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1207 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1208 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1209 /* copy values only */ 1210 matrix->values->assign(a->a, a->a+a->nz); 1211 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1212 } else { 1213 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1214 try { 1215 cusparsestruct->nonzerorow=0; 1216 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += 
((a->i[j+1]-a->i[j])>0); 1217 1218 if (a->compressedrow.use) { 1219 m = a->compressedrow.nrows; 1220 ii = a->compressedrow.i; 1221 ridx = a->compressedrow.rindex; 1222 } else { 1223 /* Forcing compressed row on the GPU */ 1224 int k=0; 1225 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1226 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1227 ii[0]=0; 1228 for (int j = 0; j<m; j++) { 1229 if ((a->i[j+1]-a->i[j])>0) { 1230 ii[k] = a->i[j]; 1231 ridx[k]= j; 1232 k++; 1233 } 1234 } 1235 ii[cusparsestruct->nonzerorow] = a->nz; 1236 m = cusparsestruct->nonzerorow; 1237 } 1238 1239 /* allocate space for the triangular factor information */ 1240 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1241 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1242 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1243 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1244 1245 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1246 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1247 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1248 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1249 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1250 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1251 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1252 1253 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1254 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1255 /* set the matrix */ 1256 CsrMatrix *matrix= new CsrMatrix; 1257 matrix->num_rows = m; 
1258 matrix->num_cols = A->cmap->n; 1259 matrix->num_entries = a->nz; 1260 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1261 matrix->row_offsets->assign(ii, ii + m+1); 1262 1263 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1264 matrix->column_indices->assign(a->j, a->j+a->nz); 1265 1266 matrix->values = new THRUSTARRAY(a->nz); 1267 matrix->values->assign(a->a, a->a+a->nz); 1268 1269 /* assign the pointer */ 1270 matstruct->mat = matrix; 1271 1272 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1273 CsrMatrix *matrix= new CsrMatrix; 1274 matrix->num_rows = m; 1275 matrix->num_cols = A->cmap->n; 1276 matrix->num_entries = a->nz; 1277 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1278 matrix->row_offsets->assign(ii, ii + m+1); 1279 1280 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1281 matrix->column_indices->assign(a->j, a->j+a->nz); 1282 1283 matrix->values = new THRUSTARRAY(a->nz); 1284 matrix->values->assign(a->a, a->a+a->nz); 1285 1286 cusparseHybMat_t hybMat; 1287 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1288 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1289 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1290 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1291 matstruct->descr, matrix->values->data().get(), 1292 matrix->row_offsets->data().get(), 1293 matrix->column_indices->data().get(), 1294 hybMat, 0, partition);CHKERRCUDA(stat); 1295 /* assign the pointer */ 1296 matstruct->mat = hybMat; 1297 1298 if (matrix) { 1299 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1300 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1301 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1302 delete (CsrMatrix*)matrix; 1303 } 1304 } 1305 1306 /* assign the compressed row indices */ 1307 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1308 matstruct->cprowIndices->assign(ridx,ridx+m); 1309 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1310 1311 /* assign the pointer */ 1312 cusparsestruct->mat = matstruct; 1313 1314 if (!a->compressedrow.use) { 1315 ierr = PetscFree(ii);CHKERRQ(ierr); 1316 ierr = PetscFree(ridx);CHKERRQ(ierr); 1317 } 1318 cusparsestruct->workVector = new THRUSTARRAY(m); 1319 } catch(char *ex) { 1320 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1321 } 1322 cusparsestruct->nonzerostate = A->nonzerostate; 1323 } 1324 ierr = WaitForGPU();CHKERRCUDA(ierr); 1325 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1326 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1327 } 1328 PetscFunctionReturn(0); 1329 } 1330 1331 struct VecCUDAPlusEquals 1332 { 1333 template <typename Tuple> 1334 __host__ __device__ 1335 void operator()(Tuple t) 1336 { 1337 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1338 } 1339 }; 1340 1341 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1342 { 1343 PetscErrorCode ierr; 1344 1345 PetscFunctionBegin; 1346 ierr = 
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1347 PetscFunctionReturn(0); 1348 } 1349 1350 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1351 { 1352 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1353 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1354 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1355 const PetscScalar *xarray; 1356 PetscScalar *yarray; 1357 PetscErrorCode ierr; 1358 cusparseStatus_t stat; 1359 1360 PetscFunctionBegin; 1361 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1362 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1363 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1364 if (!matstructT) { 1365 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1366 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1367 } 1368 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1369 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1370 1371 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1372 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1373 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1374 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1375 mat->num_rows, mat->num_cols, 1376 mat->num_entries, matstructT->alpha, matstructT->descr, 1377 mat->values->data().get(), mat->row_offsets->data().get(), 1378 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1379 yarray);CHKERRCUDA(stat); 1380 } else { 1381 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1382 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1383 matstructT->alpha, matstructT->descr, hybMat, 1384 xarray, matstructT->beta_zero, 1385 yarray);CHKERRCUDA(stat); 1386 } 1387 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1388 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1389 if 
(!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count: 2 flops per stored nonzero, minus one per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy on the GPU (yy may be NULL, in which case this is a plain MatMult
   into zz). Handles both compressed-row (zero rows squeezed out) and full storage:
   when rows are compressed the SpMV writes into workVector and the result is
   scattered back into zz with a += through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr == PETSC_TRUE when no rows were compressed away (workVector spans all rows) */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV target: zz directly when uncompressed, otherwise the scratch workVector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates in place (zz aliases yy and is the SpMV target); beta_zero overwrites */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* compressed path: seed zz with yy, the partial products are += below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult on compressed storage: rows not in cprowIndices must read 0 */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* restore must mirror the Get above: Get/GetWrite pairing depends on (yy && !cmpr) */
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy on the GPU, implemented as a NON_TRANSPOSE SpMV with the
   cached explicit transpose (built on demand via
   MatSeqAIJCUSPARSEGenerateTransposeForMult), followed by a += scatter into zz. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* lazily build the transpose structure the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta_zero,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the scatter length here is A->cmap->n while MatMultAdd uses
       workVector->size(); presumably equivalent for the transpose structure — confirm */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    /* NOTE(review): unreachable — SETERRQ1 returns from the function above */
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Final assembly: run the host-side SeqAIJ assembly, push the data to the GPU
   (only for non-factored matrices), and (re)install the GPU mult kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Tear down the GPU-side data (mult structures or triangular factors, depending
   on whether this is a factored matrix), then run the host-side SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate via the host-side SeqAIJ duplicate, then attach a fresh (empty)
   CUSPARSE context; GPU data is NOT copied — the new matrix starts
   PETSC_OFFLOAD_UNALLOCATED and is pushed to the GPU on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: sets the default
   vector type to CUDA, attaches an empty GPU context, and installs the
   CUSPARSE ops table.  Mirrors the setup done in MatDuplicate_SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor: create a plain SeqAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/* Register the cusparse solver type for LU, Cholesky, ILU and ICC factorizations
   of seqaijcusparse matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* Free the whole GPU context of a non-factored matrix: both mult structures,
   the work vector, and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* NOTE(review): return codes of the two destroy calls below are discarded;
       elsewhere in this file they are checked with CHKERRQ — consider doing so here */
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    /* NOTE(review): intentional assignment inside the condition (grabs the handle
       and tests it for NULL); extra parentheses would silence compiler warnings */
    if (handle = (*cusparsestruct)->handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a CsrMatrix (thrust device vectors for values, column indices and row
   offsets) and NULL the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Free one triangular-factor structure: its cusparse matrix descriptor, its
   solve-analysis info, and the CSR storage of the factor. Safe to call with
   *trifactor == NULL; NULLs the caller's pointer on success. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a mult structure: the stored matrix (HYB or CSR depending on format),
   the descriptor, the compressed-row index vector, and the device-resident
   alpha/beta scalars. Safe to call with *matstruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* fix: previously the return code of CsrMatrix_Destroy was discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
      (*matstruct)->mat = 0;  /* avoid a dangling pointer between free and delete */
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one live in device memory (used with
       CUSPARSE_POINTER_MODE_DEVICE), hence cudaFree rather than delete */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free the complete triangular-factor context of a factored matrix: all four
   factor structures (lower/upper and their transposes), the permutation index
   vectors, the work vector, and the cusparse handle.
   Safe to call with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* fix: the four destroy calls previously discarded their return codes */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* fix: was `if (handle = (*trifactors)->handle)` — assignment-in-condition
       triggers compiler warnings and obscures intent */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
      (*trifactors)->handle = 0;
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
