/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Printable names for MatCUSPARSEStorageFormat; the trailing entries are the enum
   type name and option prefix expected by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization entry points defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves, options handling, and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures.  NOTE: this is a .cu (C++) translation
   unit, so MatSeqAIJCUSPARSEMultStruct_Destroy is deliberately overloaded on its
   argument type (triangular-factor struct vs. SpMV struct). */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a CUDA stream to the cuSPARSE handle held in A's GPU context. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Adopt a new cuSPARSE handle for A, destroying any previously held one, and
   (re)set the pointer mode to DEVICE so scalar arguments are taken from GPU memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop A's reference to its cuSPARSE handle without destroying the handle itself. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package that produced this factored matrix. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the (square) factor
   matrix B and installs the symbolic-factorization callbacks matching ftype. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation of MatCUSPARSESetFormat(): record the requested GPU
   storage format in the matrix's cuSPARSE context. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations update the same field on seq matrices */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when one is composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* options that select GPU storage formats. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* the SpMV-only format option is offered only for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: reuse the CPU AIJ implementation, then route the numeric stage to the
   CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: reuse the CPU AIJ implementation, then route the numeric stage to the
   CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: reuse the CPU AIJ implementation, then route the numeric stage to the
   CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: reuse the CPU AIJ implementation, then route the numeric stage to
   the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Assemble (in pinned host memory) the unit-diagonal lower triangular factor of the
   CPU ILU factorization, upload it to the GPU, and run the cuSPARSE triangular-solve
   analysis on it.  Only done when the GPU copy is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];

      /* Allocate pinned host buffers for the lower triangular matrix (pinned memory
         gives fast host->device transfers) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: each row gets its strictly-lower entries
         followed by an explicit unit diagonal entry */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assign() calls below copy the pinned host arrays
         to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor in the matrix's GPU context */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      /* sizeof(int) here matches the 32-bit device index arrays (THRUSTINTARRAY32) */
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Assemble (in pinned host memory) the upper triangular factor of the CPU ILU
   factorization with its diagonal stored as reciprocals, upload it to the GPU, and
   run the cuSPARSE triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/ 354 nzUpper = adiag[0]-adiag[n]; 355 356 /* Allocate Space for the upper triangular matrix */ 357 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 358 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 359 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 360 361 /* Fill the upper triangular matrix */ 362 AiUp[0]=(PetscInt) 0; 363 AiUp[n]=nzUpper; 364 offset = nzUpper; 365 for (i=n-1; i>=0; i--) { 366 v = aa + adiag[i+1] + 1; 367 vi = aj + adiag[i+1] + 1; 368 369 /* number of elements NOT on the diagonal */ 370 nz = adiag[i] - adiag[i+1]-1; 371 372 /* decrement the offset */ 373 offset -= (nz+1); 374 375 /* first, set the diagonal elements */ 376 AjUp[offset] = (PetscInt) i; 377 AAUp[offset] = (MatScalar)1./v[nz]; 378 AiUp[i] = AiUp[i+1] - (nz+1); 379 380 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 381 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 382 } 383 384 /* allocate space for the triangular factor information */ 385 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 386 387 /* Create the matrix description */ 388 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 389 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 390 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 391 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 392 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 393 394 /* Create the solve analysis information */ 395 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 396 397 /* set the operation */ 398 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 399 400 /* set the matrix */ 401 upTriFactor->csrMat = new CsrMatrix; 402 upTriFactor->csrMat->num_rows = n; 403 
upTriFactor->csrMat->num_cols = n; 404 upTriFactor->csrMat->num_entries = nzUpper; 405 406 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 407 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 408 409 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 410 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 411 412 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 413 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 414 415 /* perform the solve analysis */ 416 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 417 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 418 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 419 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 420 421 /* assign the pointer. Is this really necessary? */ 422 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 423 424 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 425 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 426 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 427 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 428 } catch(char *ex) { 429 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 430 } 431 } 432 PetscFunctionReturn(0); 433 } 434 435 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 436 { 437 PetscErrorCode ierr; 438 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 439 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 440 IS isrow = a->row,iscol = a->icol; 441 PetscBool row_identity,col_identity; 442 const PetscInt *r,*c; 443 PetscInt n = A->rmap->n; 444 445 PetscFunctionBegin; 446 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 447 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 448 449 cusparseTriFactors->workVector = 
new THRUSTARRAY(n); 450 cusparseTriFactors->nnz=a->nz; 451 452 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 453 /*lower triangular indices */ 454 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 455 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 456 if (!row_identity) { 457 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 458 cusparseTriFactors->rpermIndices->assign(r, r+n); 459 } 460 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 461 462 /*upper triangular indices */ 463 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 464 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 465 if (!col_identity) { 466 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 467 cusparseTriFactors->cpermIndices->assign(c, c+n); 468 } 469 470 if(!row_identity && !col_identity) { 471 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 472 } else if(!row_identity) { 473 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 474 } else if(!col_identity) { 475 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 476 } 477 478 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 479 PetscFunctionReturn(0); 480 } 481 482 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 483 { 484 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 485 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 486 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 487 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 488 cusparseStatus_t stat; 489 PetscErrorCode ierr; 490 PetscInt *AiUp, *AjUp; 491 PetscScalar *AAUp; 492 PetscScalar *AALo; 493 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 494 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 495 const PetscInt *ai = b->i,*aj = b->j,*vj; 496 const MatScalar *aa = b->a,*v; 497 498 PetscFunctionBegin; 499 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || 
A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 500 try { 501 /* Allocate Space for the upper triangular matrix */ 502 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 503 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 504 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 505 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 506 507 /* Fill the upper triangular matrix */ 508 AiUp[0]=(PetscInt) 0; 509 AiUp[n]=nzUpper; 510 offset = 0; 511 for (i=0; i<n; i++) { 512 /* set the pointers */ 513 v = aa + ai[i]; 514 vj = aj + ai[i]; 515 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 516 517 /* first, set the diagonal elements */ 518 AjUp[offset] = (PetscInt) i; 519 AAUp[offset] = (MatScalar)1.0/v[nz]; 520 AiUp[i] = offset; 521 AALo[offset] = (MatScalar)1.0/v[nz]; 522 523 offset+=1; 524 if (nz>0) { 525 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 526 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 527 for (j=offset; j<offset+nz; j++) { 528 AAUp[j] = -AAUp[j]; 529 AALo[j] = AAUp[j]/v[nz]; 530 } 531 offset+=nz; 532 } 533 } 534 535 /* allocate space for the triangular factor information */ 536 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 537 538 /* Create the matrix description */ 539 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 540 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 541 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 542 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 543 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 544 545 /* Create the solve analysis information */ 546 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 547 548 /* set the operation */ 549 
upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 550 551 /* set the matrix */ 552 upTriFactor->csrMat = new CsrMatrix; 553 upTriFactor->csrMat->num_rows = A->rmap->n; 554 upTriFactor->csrMat->num_cols = A->cmap->n; 555 upTriFactor->csrMat->num_entries = a->nz; 556 557 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 558 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 559 560 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 561 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 562 563 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 564 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 565 566 /* perform the solve analysis */ 567 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 568 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 569 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 570 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 571 572 /* assign the pointer. Is this really necessary? 
*/ 573 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 574 575 /* allocate space for the triangular factor information */ 576 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 577 578 /* Create the matrix description */ 579 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 580 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 581 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 582 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 583 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 584 585 /* Create the solve analysis information */ 586 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 587 588 /* set the operation */ 589 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 590 591 /* set the matrix */ 592 loTriFactor->csrMat = new CsrMatrix; 593 loTriFactor->csrMat->num_rows = A->rmap->n; 594 loTriFactor->csrMat->num_cols = A->cmap->n; 595 loTriFactor->csrMat->num_entries = a->nz; 596 597 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 598 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 599 600 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 601 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 602 603 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 604 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 605 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 606 607 /* perform the solve analysis */ 608 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 609 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 610 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 611 
loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 612 613 /* assign the pointer. Is this really necessary? */ 614 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 615 616 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 617 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 618 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 619 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 620 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 621 } catch(char *ex) { 622 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 623 } 624 } 625 PetscFunctionReturn(0); 626 } 627 628 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 629 { 630 PetscErrorCode ierr; 631 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 632 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 633 IS ip = a->row; 634 const PetscInt *rip; 635 PetscBool perm_identity; 636 PetscInt n = A->rmap->n; 637 638 PetscFunctionBegin; 639 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 640 cusparseTriFactors->workVector = new THRUSTARRAY(n); 641 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 642 643 /*lower triangular indices */ 644 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 645 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 646 if (!perm_identity) { 647 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 648 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 649 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 650 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 651 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 652 } 653 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 654 PetscFunctionReturn(0); 655 } 656 657 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 658 { 659 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 660 IS isrow = b->row,iscol = b->col; 661 PetscBool row_identity,col_identity; 662 PetscErrorCode ierr; 663 664 
PetscFunctionBegin; 665 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 666 /* determine which version of MatSolve needs to be used. */ 667 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 668 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 669 if (row_identity && col_identity) { 670 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 671 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 672 } else { 673 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 674 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 675 } 676 677 /* get the triangular factors */ 678 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 679 PetscFunctionReturn(0); 680 } 681 682 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 683 { 684 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 685 IS ip = b->row; 686 PetscBool perm_identity; 687 PetscErrorCode ierr; 688 689 PetscFunctionBegin; 690 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 691 692 /* determine which version of MatSolve needs to be used. 
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* identity permutation: install the cheaper solve kernels that skip the
       row/column reordering steps */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds, on the GPU, explicit transposes (CSC form, stored as CSR) of the lower
   and upper triangular factors held in A->spptr and runs the cusparse solve
   analysis on each, so that the transpose-solve routines can do plain
   (non-transpose) triangular solves against them.  Results are stashed in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of the TriFactors struct.

   NOTE(review): the values read into loTriFactorT/upTriFactorT below are dead --
   both are immediately overwritten with freshly allocated structs.  Callers must
   invoke this only while the transpose pointers are still NULL (both current
   call sites check this), otherwise the previously built structs would leak.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode is
     flipped because the transpose of a lower triangular matrix is upper
     triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the matrix is stored explicitly transposed, so the solve
     itself is a non-transpose solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC.
     Note the factor is square, so csc col-pointer length == num_rows+1
     and the allocation above is the right size. */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode
     flipped, as for the lower factor above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   Builds, on the GPU, an explicit transpose of the AIJ matrix stored in
   A->spptr (CSR, ELL, or HYB format) for use by the MatMultTranspose kernels,
   and stashes it in cusparsestruct->matTranspose.

   NOTE(review): the value read into matstructT below is dead -- it is
   immediately overwritten with a freshly allocated struct.  Callers only
   invoke this when matTranspose is still NULL (both call sites check).
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-side scalar constants used by the spmv calls
     (pointer mode DEVICE is selected below) */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
CsrMatrix *matrixT= new CsrMatrix; 870 matrixT->num_rows = A->cmap->n; 871 matrixT->num_cols = A->rmap->n; 872 matrixT->num_entries = a->nz; 873 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 874 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 875 matrixT->values = new THRUSTARRAY(a->nz); 876 877 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 878 indexBase = cusparseGetMatIndexBase(matstruct->descr); 879 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 880 matrix->num_cols, matrix->num_entries, 881 matrix->values->data().get(), 882 matrix->row_offsets->data().get(), 883 matrix->column_indices->data().get(), 884 matrixT->values->data().get(), 885 matrixT->column_indices->data().get(), 886 matrixT->row_offsets->data().get(), 887 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 888 889 /* assign the pointer */ 890 matstructT->mat = matrixT; 891 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 892 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 893 #if CUDA_VERSION>=5000 894 /* First convert HYB to CSR */ 895 CsrMatrix *temp= new CsrMatrix; 896 temp->num_rows = A->rmap->n; 897 temp->num_cols = A->cmap->n; 898 temp->num_entries = a->nz; 899 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 900 temp->column_indices = new THRUSTINTARRAY32(a->nz); 901 temp->values = new THRUSTARRAY(a->nz); 902 903 904 stat = cusparse_hyb2csr(cusparsestruct->handle, 905 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 906 temp->values->data().get(), 907 temp->row_offsets->data().get(), 908 temp->column_indices->data().get());CHKERRCUDA(stat); 909 910 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 911 CsrMatrix *tempT= new CsrMatrix; 912 tempT->num_rows = A->rmap->n; 913 tempT->num_cols = A->cmap->n; 914 tempT->num_entries = a->nz; 915 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 916 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 917 tempT->values = new THRUSTARRAY(a->nz); 918 919 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 920 temp->num_cols, temp->num_entries, 921 temp->values->data().get(), 922 temp->row_offsets->data().get(), 923 temp->column_indices->data().get(), 924 tempT->values->data().get(), 925 tempT->column_indices->data().get(), 926 tempT->row_offsets->data().get(), 927 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 928 929 /* Last, convert CSC to HYB */ 930 cusparseHybMat_t hybMat; 931 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 932 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 933 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 934 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 935 matstructT->descr, tempT->values->data().get(), 936 tempT->row_offsets->data().get(), 937 tempT->column_indices->data().get(), 938 hybMat, 0, partition);CHKERRCUDA(stat); 939 940 /* assign the pointer */ 941 matstructT->mat = hybMat; 942 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 943 944 /* delete temporaries */ 945 if (tempT) { 946 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 947 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 948 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 949 delete (CsrMatrix*) tempT; 950 } 951 if (temp) { 952 if (temp->values) delete (THRUSTARRAY*) temp->values; 953 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 954 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 955 delete (CsrMatrix*) temp; 956 
} 957 #else 958 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 959 #endif 960 } 961 /* assign the compressed row indices */ 962 matstructT->cprowIndices = new THRUSTINTARRAY; 963 matstructT->cprowIndices->resize(A->cmap->n); 964 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 965 /* assign the pointer */ 966 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 967 PetscFunctionReturn(0); 968 } 969 970 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 971 { 972 PetscInt n = xx->map->n; 973 const PetscScalar *barray; 974 PetscScalar *xarray; 975 thrust::device_ptr<const PetscScalar> bGPU; 976 thrust::device_ptr<PetscScalar> xGPU; 977 cusparseStatus_t stat; 978 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 979 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 980 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 981 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 982 PetscErrorCode ierr; 983 984 PetscFunctionBegin; 985 /* Analyze the matrix and create the transpose ... 
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the explicitly transposed factors on first transpose solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm).
     NOTE(review): this mirrors the non-transpose solve (rperm in, cperm out);
     confirm against the factorization's permutation convention that the same
     ordering applies to the transpose solve. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for A^T = (LU)^T the U^T solve comes first; U^T is held
     explicitly transposed, hence a non-transpose solve against upTriFactorT */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve with the transpose, A^T x = b, for a factorization done in
   the natural (identity) ordering: no permutations are needed, only the two
   triangular solves against the explicitly transposed factors.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the explicitly transposed factors on first transpose solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (the transposed-factor order is U then L; both factors are
     stored explicitly transposed so these are non-transpose solves) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve A x = b for a factored matrix with a non-identity ordering:
   permutes b with the row permutation, solves L then U, then applies the
   column permutation to the result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm).  (The end iterator
     of a permutation_iterator is determined by the index iterator, so the same
     bGPU base in both arguments is fine.) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Triangular solve A x = b for a factorization done in the natural (identity)
   ordering: no permutations, just the L then U solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Copies the host AIJ matrix to the GPU in the storage format selected in
   cusparsestruct->format (CSR, ELL, or HYB).  If only the numerical values
   changed (same nonzero state, CSR format), just the values array is copied;
   otherwise the GPU structure is rebuilt from scratch.  Rows with no nonzeros
   are always squeezed out ("compressed row") on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only: the nonzero pattern is unchanged */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* NOTE(review): return value not checked -- other call sites use the
         ierr = ...;CHKERRQ(ierr); convention; verify and make consistent */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host matrix is already compressed; borrow its arrays */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets and row-index
             map containing only the rows with at least one nonzero */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-side scalar constants for the spmv calls (pointer mode DEVICE) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage the matrix in CSR on the GPU, convert to HYB, then discard
             the CSR staging arrays */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* NOTE(review): if a workVector from a previous nonzero pattern exists
           it is not freed before this reallocation -- looks like a leak when
           the pattern changes; verify against MatSeqAIJCUSPARSE_Destroy */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor for thrust::for_each over a zip iterator: adds the first tuple
   element into the second (y += x), used to scatter-add compressed-row spmv
   results into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A x, implemented as MatMultAdd with no vector to add */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x, using the explicitly stored transpose (built lazily) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* the transpose is stored explicitly, so the spmv is a non-transpose
     multiply against matstructT */
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* z = y + A x (or z = A x when yy is NULL; called that way by MatMult) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* when the matrix has no empty rows (workVector covers all rmap->n rows)
       the spmv can write straight into z; otherwise it writes into the
       compressed workVector and is scattered into z afterwards */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one makes the spmv itself accumulate (z = z + A x); otherwise the
       add happens later via VecAXPY/scatter */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is in workVector: seed z with y, then scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* spmv wrote A x into z directly; add y */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult with compressed rows: zero z before the scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = y + A^T x, using the explicitly stored transpose (built lazily) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta_zero,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
    }

    /* scatter the data from the temporary into the full vector with a += operation.
       NOTE(review): the iteration count here is A->cmap->n (rows of the transpose)
       but workVector is sized to the compressed row count of A (<= A->rmap->n);
       verify workVector is large enough when cmap->n > number of nonzero rows. */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr =
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1552 } 1553 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1554 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1555 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1556 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1557 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1558 PetscFunctionReturn(0); 1559 } 1560 1561 /* --------------------------------------------------------------------------------*/ 1562 /*@ 1563 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1564 (the default parallel PETSc format). This matrix will ultimately pushed down 1565 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1566 assembly performance the user should preallocate the matrix storage by setting 1567 the parameter nz (or the array nnz). By setting these parameters accurately, 1568 performance during matrix assembly can be increased by more than a factor of 50. 1569 1570 Collective 1571 1572 Input Parameters: 1573 + comm - MPI communicator, set to PETSC_COMM_SELF 1574 . m - number of rows 1575 . n - number of columns 1576 . nz - number of nonzeros per row (same for all rows) 1577 - nnz - array containing the number of nonzeros in the various rows 1578 (possibly different for each row) or NULL 1579 1580 Output Parameter: 1581 . A - the matrix 1582 1583 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1584 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1585 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1586 1587 Notes: 1588 If nnz is given then nz is ignored 1589 1590 The AIJ format (also called the Yale sparse matrix format or 1591 compressed row storage), is fully compatible with standard Fortran 77 1592 storage. That is, the stored row and column indices can begin at 1593 either one (as in Fortran) or zero. See the users' manual for details. 
1594 1595 Specify the preallocated storage with either nz or nnz (not both). 1596 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1597 allocation. For large problems you MUST preallocate memory or you 1598 will get TERRIBLE performance, see the users' manual chapter on matrices. 1599 1600 By default, this format uses inodes (identical nodes) when possible, to 1601 improve numerical efficiency of matrix-vector products and solves. We 1602 search for consecutive rows with the same nonzero structure, thereby 1603 reusing matrix information to achieve increased efficiency. 1604 1605 Level: intermediate 1606 1607 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1608 @*/ 1609 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1610 { 1611 PetscErrorCode ierr; 1612 1613 PetscFunctionBegin; 1614 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1615 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1616 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1617 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1618 PetscFunctionReturn(0); 1619 } 1620 1621 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1622 { 1623 PetscErrorCode ierr; 1624 1625 PetscFunctionBegin; 1626 if (A->factortype==MAT_FACTOR_NONE) { 1627 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1628 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1629 } 1630 } else { 1631 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1632 } 1633 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1638 { 1639 PetscErrorCode ierr; 1640 Mat C; 1641 cusparseStatus_t stat; 1642 
cusparseHandle_t handle=0; 1643 1644 PetscFunctionBegin; 1645 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1646 C = *B; 1647 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1648 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1649 1650 /* inject CUSPARSE-specific stuff */ 1651 if (C->factortype==MAT_FACTOR_NONE) { 1652 /* you cannot check the inode.use flag here since the matrix was just created. 1653 now build a GPU matrix data structure */ 1654 C->spptr = new Mat_SeqAIJCUSPARSE; 1655 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1656 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1657 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1658 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1659 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1660 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1661 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1662 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1663 } else { 1664 /* NEXT, set the pointers to the triangular factors */ 1665 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1666 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1667 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1668 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1669 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1670 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1671 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1672 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1673 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1674 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1675 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1676 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1677 } 1678 1679 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1680 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1681 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1682 
C->ops->mult = MatMult_SeqAIJCUSPARSE; 1683 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1684 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1685 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1686 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1687 1688 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1689 1690 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1691 1692 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1693 PetscFunctionReturn(0); 1694 } 1695 1696 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1697 { 1698 PetscErrorCode ierr; 1699 cusparseStatus_t stat; 1700 cusparseHandle_t handle=0; 1701 1702 PetscFunctionBegin; 1703 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1704 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1705 1706 if (B->factortype==MAT_FACTOR_NONE) { 1707 /* you cannot check the inode.use flag here since the matrix was just created. 
1708 now build a GPU matrix data structure */ 1709 B->spptr = new Mat_SeqAIJCUSPARSE; 1710 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1711 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1712 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1713 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1714 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1715 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1716 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1717 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1718 } else { 1719 /* NEXT, set the pointers to the triangular factors */ 1720 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1724 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1725 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1727 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1728 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1729 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1731 } 1732 1733 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1734 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1735 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1736 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1737 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1738 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1739 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1740 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1741 1742 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1743 1744 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1745 1746 ierr = PetscObjectComposeFunction((PetscObject)B, 
"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1747 PetscFunctionReturn(0); 1748 } 1749 1750 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1751 { 1752 PetscErrorCode ierr; 1753 1754 PetscFunctionBegin; 1755 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1756 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B); 1757 PetscFunctionReturn(0); 1758 } 1759 1760 /*MC 1761 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1762 1763 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1764 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1765 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1766 1767 Options Database Keys: 1768 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1769 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1770 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1771 1772 Level: beginner 1773 1774 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1775 M*/ 1776 1777 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1778 1779 1780 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1781 { 1782 PetscErrorCode ierr; 1783 1784 PetscFunctionBegin; 1785 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1786 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1787 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1788 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1789 PetscFunctionReturn(0); 1790 } 1791 1792 1793 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1794 { 1795 cusparseStatus_t stat; 1796 cusparseHandle_t handle; 1797 1798 PetscFunctionBegin; 1799 if (*cusparsestruct) { 1800 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1801 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1802 delete (*cusparsestruct)->workVector; 1803 if (handle = (*cusparsestruct)->handle) { 1804 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1805 } 1806 delete *cusparsestruct; 1807 *cusparsestruct = 0; 1808 } 1809 PetscFunctionReturn(0); 1810 } 1811 1812 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1813 { 1814 PetscFunctionBegin; 1815 if (*mat) { 1816 delete (*mat)->values; 1817 delete (*mat)->column_indices; 1818 delete (*mat)->row_offsets; 1819 delete *mat; 1820 *mat = 0; 1821 } 1822 PetscFunctionReturn(0); 1823 } 1824 1825 
/* MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - free the cusparse
   matrix descriptor, the solve-analysis info, and the CSR storage of one factor. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSEMultStruct_Destroy (mat-vec overload) - free the matrix storage
   (HYB or CSR according to format), the descriptor, the compressed-row index vector,
   and the three device-resident scalar constants (alpha, beta_zero, beta_one). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code of CsrMatrix_Destroy was silently dropped */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* MatSeqAIJCUSPARSETriFactors_Destroy - free all four triangular-factor structures,
   the row/column permutation index vectors, the work vector, and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* bug fix: the return codes of these four destroy calls were silently dropped */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) { /* was an assignment hidden inside the if-condition */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
