1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 16 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 17 18 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 20 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 21 22 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 24 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 25 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 30 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 31 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 34 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 35 36 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 37 static PetscErrorCode 
MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
/* NOTE(review): the declaration above reuses the name MatSeqAIJCUSPARSEMultStruct_Destroy with a
   different parameter list than the one below; this compiles only because .cu files are C++
   (overloading) -- confirm against the definitions later in the file */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - records the given CUDA stream in A's GPU data and makes the
   attached cuSPARSE handle issue its work on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - adopts the caller-supplied cuSPARSE handle for A.
   If A already holds a different handle, that handle is destroyed first.
   The (new) handle is switched to device pointer mode, i.e. scalar arguments to
   subsequent cuSPARSE calls are expected to reside in GPU memory.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently hold before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets (but does NOT destroy) the cuSPARSE handle stored in A.
   Ownership of the handle is presumably retained by whoever installed it via
   MatCUSPARSESetHandle() -- the pointer is simply zeroed here.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback: reports that factors created by this class belong to the "cusparse" solver package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory routine registered for (A=seqaijcusparse, package=cusparse): creates the factor
   matrix B (square, same local size as A) and installs the symbolic-factorization
   function pointers appropriate for the requested factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated by the numeric factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested GPU
   storage format in A's Mat_SeqAIJCUSPARSE struct */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* NOTE(review): for the sequential class MULT and ALL set the same (single) format field */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE when A is of the right type; a no-op otherwise */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing: -mat_cusparse_mult_storage_format (SpMV only, non-factored matrices)
   and -mat_cusparse_storage_format (SpMV and TriSolve) */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the mult-only format option is offered only for non-factor matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: defer to the CPU AIJ implementation, then route the numeric phase
   to the CUSPARSE-aware numeric routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU symbolic phase, CUSPARSE-aware numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE-aware numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE-aware numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower triangular
   factor L from the combined LU storage of the (already factored) host matrix A, uploads
   it to the GPU as a CSR CsrMatrix, and runs the cuSPARSE triangular-solve analysis on it.
   Only acts when the GPU copy is absent or stale (PETSC_OFFLOAD_UNALLOCATED / _CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of L holds only its unit diagonal, hence ai[1] is subtracted and n diagonal entries are added */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the subsequent host-to-device transfers can be fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the implicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy of L (the assign() calls upload the pinned host arrays) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular factor U
   (diagonal stored inverted) from the combined LU storage of the host matrix A, uploads
   it to the GPU as CSR, and runs the cuSPARSE triangular-solve analysis on it.
   Only acts when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         (a->diag stores U rows back-to-front, so the count is adiag[0]-adiag[n]) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; rows are walked last-to-first because the
         U part of the factored storage is laid out in that order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored inverted in the factor: write 1/d) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy of U */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors on the
   GPU, allocates the device work vector, and uploads the row/column permutations
   (only when they are not the identity).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices: upload the row permutation only if it is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices: upload the column permutation only if it is not the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data that was actually transferred */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an
   incomplete-Cholesky factorization on the GPU from the upper-triangular host
   storage of A. The "lower" solve is realized as the TRANSPOSE operation applied
   to upper-stored data (see the fill mode / solveOp settings below).
   Only acts when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is also cast to Mat_SeqSBAIJ here and only b's i/j/a arrays
     are read below -- confirm the factor's data layout makes this aliasing valid */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if
(A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the values for the transposed/lower solve) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (v[nz] is the diagonal entry of row i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, UNIT diagonal for this factor */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy of the upper factor */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor reuses the UPPER-stored
         pattern (same AiUp/AjUp) and is solved with the TRANSPOSE operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same structure as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds both ICC triangular factors on the
   GPU, allocates the device work vector, and uploads the (single, symmetric) permutation
   when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonal entries count twice (factor is stored upper-only), diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices: same permutation is used for rows and columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric LU: performs the factorization on the CPU, then selects the MatSolve variant
   (permuted vs. natural ordering) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the NaturalOrdering variants
     skip the permutation step and apply only when both orderings are the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky: performs the factorization on the CPU, then selects the MatSolve
   variant (permuted vs. natural ordering) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (CSC) of both
   triangular factors on the GPU and runs the solve analysis on them, for use by
   MatSolveTranspose. (Definition continues beyond this excerpt.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped because the transpose swaps lower and upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 738 diagType = cusparseGetMatDiagType(loTriFactor->descr); 739 740 /* Create the matrix description */ 741 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 742 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 743 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 744 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 745 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 746 747 /* Create the solve analysis information */ 748 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 749 750 /* set the operation */ 751 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 752 753 /* allocate GPU space for the CSC of the lower triangular factor*/ 754 loTriFactorT->csrMat = new CsrMatrix; 755 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 756 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 757 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 758 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 759 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 760 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 761 762 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 763 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 764 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 765 loTriFactor->csrMat->values->data().get(), 766 loTriFactor->csrMat->row_offsets->data().get(), 767 loTriFactor->csrMat->column_indices->data().get(), 768 loTriFactorT->csrMat->values->data().get(), 769 loTriFactorT->csrMat->column_indices->data().get(), 770 loTriFactorT->csrMat->row_offsets->data().get(), 771 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 772 773 /* perform the solve analysis on the transposed matrix */ 774 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 775 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 776 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 777 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 778 loTriFactorT->solveInfo);CHKERRCUDA(stat); 779 780 /* assign the pointer. Is this really necessary? */ 781 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 782 783 /*********************************************/ 784 /* Now the Transpose of the Upper Tri Factor */ 785 /*********************************************/ 786 787 /* allocate space for the transpose of the upper triangular factor */ 788 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 789 790 /* set the matrix descriptors of the upper triangular factor */ 791 matrixType = cusparseGetMatType(upTriFactor->descr); 792 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 793 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
794 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 795 diagType = cusparseGetMatDiagType(upTriFactor->descr); 796 797 /* Create the matrix description */ 798 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 799 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 800 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 801 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 802 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 803 804 /* Create the solve analysis information */ 805 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 806 807 /* set the operation */ 808 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 809 810 /* allocate GPU space for the CSC of the upper triangular factor*/ 811 upTriFactorT->csrMat = new CsrMatrix; 812 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 813 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 814 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 815 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 816 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 817 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 818 819 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 820 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 821 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 822 upTriFactor->csrMat->values->data().get(), 823 upTriFactor->csrMat->row_offsets->data().get(), 824 upTriFactor->csrMat->column_indices->data().get(), 825 upTriFactorT->csrMat->values->data().get(), 826 upTriFactorT->csrMat->column_indices->data().get(), 827 upTriFactorT->csrMat->row_offsets->data().get(), 828 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 829 830 /* perform the solve analysis on the transposed matrix */ 831 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 832 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 833 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 834 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 835 upTriFactorT->solveInfo);CHKERRCUDA(stat); 836 837 /* assign the pointer. Is this really necessary? 
*/
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - Builds an explicit transpose of the GPU matrix
   in A->spptr (CSR via csr2csc, or HYB/ELL via hyb2csr -> csr2csc -> csr2hyb) and stores it in
   cusparsestruct->matTranspose so that MatMultTranspose/MatMultTransposeAdd can run as
   NON_TRANSPOSE SpMV on the stored transpose.

   Input Parameter:
.  A - the matrix; A->spptr holds the Mat_SeqAIJCUSPARSE struct with an up-to-date GPU copy

   Note: fixes a device-memory leak present in the earlier version: the temporary expanded
   row-offset array used for csr2csc is now freed after the conversion.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose mult structure (overwrites the NULL read above;
     callers only invoke this when matTranspose is NULL) */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars because the handle uses POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha),     sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one),  sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,     &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one,  &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows       = A->cmap->n;
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* the expanded row offsets are only needed by csr2csc above; free them here
       (previously this allocation was leaked). The device_vector destructor calls
       cudaFree, which does not complete until prior work on the device has finished. */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values)         delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets)    delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values)         delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets)    delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed, so the
     "compressed" row list is simply 0..ncols-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  /* lazily build the explicit transposed factors the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for A^T x = b the factor order is reversed: U^T before L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Transpose solve for a factorization done with the natural (identity) ordering:
   identical to MatSolveTranspose_SeqAIJCUSPARSE but with no row/column permutations. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Forward/backward triangular solve with row/column permutations (non-natural ordering). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>
xGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm).
     NOTE(review): the end iterator reuses base bGPU (the transpose variant uses bGPU+n);
     the copy length comes from the rpermIndices range, and the end iterator's base is
     never dereferenced, so the two spellings behave the same -- confirm against thrust
     permutation_iterator docs if this is ever touched. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Triangular solve for a factorization done with the natural (identity) ordering:
   no permutations, L then U directly on the raw vector arrays. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - Mirrors the host AIJ matrix onto the GPU. If only values
   changed (same nonzero state, CSR format) just the values array is re-uploaded; otherwise
   the GPU structures are rebuilt from scratch in compressed-row form.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern (or format) changed: discard and rebuild the GPU structures */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow +=
((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices over nonempty rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix (mult) information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars, matching the POINTER_MODE_DEVICE set below */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* upload as CSR first, then convert to HYB/ELL and discard the CSR staging copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were only malloc'd in the forced-compression branch above */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: second tuple element += first.
   Used to scatter-add compressed-row SpMV results into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A*x, implemented as MatMultAdd with a NULL addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T*x, computed as a NON_TRANSPOSE SpMV with the explicitly stored transpose
   (built lazily on first use) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* zz = yy + A*xx (yy may be NULL, giving zz = A*xx; yy == zz is also supported).
   With compressed-row storage the SpMV result lands in a work vector and is then
   scatter-added into zz. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the work vector covers every row, i.e. no rows were dropped */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* write the SpMV result directly into zz when possible, else into the work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1446 size of the workVector */ 1447 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1448 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1449 mat->num_rows, mat->num_cols, 1450 mat->num_entries, matstruct->alpha, matstruct->descr, 1451 mat->values->data().get(), mat->row_offsets->data().get(), 1452 mat->column_indices->data().get(), xarray, beta, 1453 dptr);CHKERRCUDA(stat); 1454 } else { 1455 if (cusparsestruct->workVector->size()) { 1456 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1457 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1458 matstruct->alpha, matstruct->descr, hybMat, 1459 xarray, beta, 1460 dptr);CHKERRCUDA(stat); 1461 } 1462 } 1463 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1464 1465 if (yy) { 1466 if (dptr != zarray) { 1467 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1468 } else if (zz != yy) { 1469 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1470 } 1471 } else if (dptr != zarray) { 1472 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 1473 } 1474 /* scatter the data from the temporary into the full vector with a += operation */ 1475 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1476 if (dptr != zarray) { 1477 thrust::device_ptr<PetscScalar> zptr; 1478 1479 zptr = thrust::device_pointer_cast(zarray); 1480 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1481 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1482 VecCUDAPlusEquals()); 1483 } 1484 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1485 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1486 if (yy && !cmpr) { 1487 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1488 } else { 1489 ierr = 
VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1490 } 1491 } catch(char *ex) { 1492 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1493 } 1494 if (!yy) { /* MatMult */ 1495 if (!cusparsestruct->stream) { 1496 ierr = WaitForGPU();CHKERRCUDA(ierr); 1497 } 1498 } 1499 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1500 PetscFunctionReturn(0); 1501 } 1502 1503 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1504 { 1505 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1506 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1507 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1508 const PetscScalar *xarray; 1509 PetscScalar *zarray,*dptr,*beta; 1510 PetscErrorCode ierr; 1511 cusparseStatus_t stat; 1512 1513 PetscFunctionBegin; 1514 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1515 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1516 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1517 if (!matstructT) { 1518 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1519 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1520 } 1521 1522 try { 1523 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1524 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1525 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1526 dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1527 beta = (yy == zz && dptr == zarray) ? 
matstructT->beta_one : matstructT->beta_zero; 1528 1529 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1530 /* multiply add with matrix transpose */ 1531 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1532 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1533 /* here we need to be careful to set the number of rows in the multiply to the 1534 number of compressed rows in the matrix ... which is equivalent to the 1535 size of the workVector */ 1536 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1537 mat->num_rows, mat->num_cols, 1538 mat->num_entries, matstructT->alpha, matstructT->descr, 1539 mat->values->data().get(), mat->row_offsets->data().get(), 1540 mat->column_indices->data().get(), xarray, beta, 1541 dptr);CHKERRCUDA(stat); 1542 } else { 1543 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1544 if (cusparsestruct->workVector->size()) { 1545 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1546 matstructT->alpha, matstructT->descr, hybMat, 1547 xarray, beta, 1548 dptr);CHKERRCUDA(stat); 1549 } 1550 } 1551 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1552 1553 if (dptr != zarray) { 1554 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1555 } else if (zz != yy) { 1556 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1557 } 1558 /* scatter the data from the temporary into the full vector with a += operation */ 1559 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1560 if (dptr != zarray) { 1561 thrust::device_ptr<PetscScalar> zptr; 1562 1563 zptr = thrust::device_pointer_cast(zarray); 1564 1565 /* scatter the data from the temporary into the full vector with a += operation */ 1566 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1567 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, 
matstructT->cprowIndices->begin()))) + A->cmap->n, 1568 VecCUDAPlusEquals()); 1569 } 1570 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1571 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1572 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1573 } catch(char *ex) { 1574 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1575 } 1576 ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */ 1577 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1582 { 1583 PetscErrorCode ierr; 1584 1585 PetscFunctionBegin; 1586 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1587 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1588 if (A->factortype == MAT_FACTOR_NONE) { 1589 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1590 } 1591 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1592 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1593 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1594 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1595 PetscFunctionReturn(0); 1596 } 1597 1598 /* --------------------------------------------------------------------------------*/ 1599 /*@ 1600 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1601 (the default parallel PETSc format). This matrix will ultimately pushed down 1602 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1603 assembly performance the user should preallocate the matrix storage by setting 1604 the parameter nz (or the array nnz). By setting these parameters accurately, 1605 performance during matrix assembly can be increased by more than a factor of 50. 1606 1607 Collective 1608 1609 Input Parameters: 1610 + comm - MPI communicator, set to PETSC_COMM_SELF 1611 . m - number of rows 1612 . n - number of columns 1613 . 
nz - number of nonzeros per row (same for all rows) 1614 - nnz - array containing the number of nonzeros in the various rows 1615 (possibly different for each row) or NULL 1616 1617 Output Parameter: 1618 . A - the matrix 1619 1620 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1621 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1622 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1623 1624 Notes: 1625 If nnz is given then nz is ignored 1626 1627 The AIJ format (also called the Yale sparse matrix format or 1628 compressed row storage), is fully compatible with standard Fortran 77 1629 storage. That is, the stored row and column indices can begin at 1630 either one (as in Fortran) or zero. See the users' manual for details. 1631 1632 Specify the preallocated storage with either nz or nnz (not both). 1633 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1634 allocation. For large problems you MUST preallocate memory or you 1635 will get TERRIBLE performance, see the users' manual chapter on matrices. 1636 1637 By default, this format uses inodes (identical nodes) when possible, to 1638 improve numerical efficiency of matrix-vector products and solves. We 1639 search for consecutive rows with the same nonzero structure, thereby 1640 reusing matrix information to achieve increased efficiency. 
1641 1642 Level: intermediate 1643 1644 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1645 @*/ 1646 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1647 { 1648 PetscErrorCode ierr; 1649 1650 PetscFunctionBegin; 1651 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1652 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1653 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1654 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1655 PetscFunctionReturn(0); 1656 } 1657 1658 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1659 { 1660 PetscErrorCode ierr; 1661 1662 PetscFunctionBegin; 1663 if (A->factortype==MAT_FACTOR_NONE) { 1664 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1665 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1666 } 1667 } else { 1668 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1669 } 1670 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1671 PetscFunctionReturn(0); 1672 } 1673 1674 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1675 { 1676 PetscErrorCode ierr; 1677 Mat C; 1678 cusparseStatus_t stat; 1679 cusparseHandle_t handle=0; 1680 1681 PetscFunctionBegin; 1682 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1683 C = *B; 1684 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1685 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1686 1687 /* inject CUSPARSE-specific stuff */ 1688 if (C->factortype==MAT_FACTOR_NONE) { 1689 /* you cannot check the inode.use flag here since the matrix was just created. 
1690 now build a GPU matrix data structure */ 1691 C->spptr = new Mat_SeqAIJCUSPARSE; 1692 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1693 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1694 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1695 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1696 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1697 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1698 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1699 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1700 } else { 1701 /* NEXT, set the pointers to the triangular factors */ 1702 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1703 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1704 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1705 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1706 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1707 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1708 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1709 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1710 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1711 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1712 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1713 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1714 } 1715 1716 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1717 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1718 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1719 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1720 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1721 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1722 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1723 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1724 1725 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1726 1727 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1728 1729 ierr = 
PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1730 PetscFunctionReturn(0); 1731 } 1732 1733 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1734 { 1735 PetscErrorCode ierr; 1736 cusparseStatus_t stat; 1737 cusparseHandle_t handle=0; 1738 1739 PetscFunctionBegin; 1740 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1741 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1742 1743 if (B->factortype==MAT_FACTOR_NONE) { 1744 /* you cannot check the inode.use flag here since the matrix was just created. 1745 now build a GPU matrix data structure */ 1746 B->spptr = new Mat_SeqAIJCUSPARSE; 1747 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1748 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1749 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1750 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1751 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1752 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1753 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1754 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1755 } else { 1756 /* NEXT, set the pointers to the triangular factors */ 1757 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1758 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1759 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1760 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1761 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1762 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1763 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1764 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1765 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1766 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1767 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1768 } 1769 1770 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 
1771 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1772 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1773 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1774 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1775 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1776 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1777 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1778 1779 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1780 1781 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1782 1783 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1784 PetscFunctionReturn(0); 1785 } 1786 1787 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1788 { 1789 PetscErrorCode ierr; 1790 1791 PetscFunctionBegin; 1792 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1793 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1794 PetscFunctionReturn(0); 1795 } 1796 1797 /*MC 1798 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1799 1800 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1801 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1802 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1803 1804 Options Database Keys: 1805 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1806 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1807 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1808 1809 Level: beginner 1810 1811 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1812 M*/ 1813 1814 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1815 1816 1817 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1818 { 1819 PetscErrorCode ierr; 1820 1821 PetscFunctionBegin; 1822 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1823 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1824 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1825 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1826 PetscFunctionReturn(0); 1827 } 1828 1829 1830 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1831 { 1832 cusparseStatus_t stat; 1833 cusparseHandle_t handle; 1834 1835 PetscFunctionBegin; 1836 if (*cusparsestruct) { 1837 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1838 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1839 delete (*cusparsestruct)->workVector; 1840 if (handle = (*cusparsestruct)->handle) { 1841 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1842 } 1843 delete *cusparsestruct; 1844 *cusparsestruct = 0; 1845 } 1846 PetscFunctionReturn(0); 1847 } 1848 1849 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1850 { 1851 PetscFunctionBegin; 1852 if (*mat) { 1853 delete (*mat)->values; 1854 delete (*mat)->column_indices; 1855 delete (*mat)->row_offsets; 1856 delete *mat; 1857 *mat = 0; 1858 } 1859 PetscFunctionReturn(0); 1860 } 1861 1862 
/* Frees one triangular-factor structure: the cusparse matrix descriptor, the
   solve-analysis info and the CSR data, then the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees one mult structure: the stored matrix (hybrid or CSR depending on the
   storage format), descriptor, compressed-row index vector and the device-side
   alpha/beta scalars. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the error return; previously it was silently dropped */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta are device allocations (used with CUSPARSE_POINTER_MODE_DEVICE) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees the full set of triangular factors (lower/upper, plain and transpose),
   the permutation index vectors, the work vector and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error returns; previously these were silently dropped */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}