1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); /* fixed: was misnamed MatSeqAIJCUSPARSEMultStruct_Destroy, clashing with the declaration below */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/*
   MatCUSPARSESetStream - attaches a caller-owned CUDA stream to the matrix'
   cuSPARSE handle; all subsequent cuSPARSE calls made through this handle are
   issued on that stream.  The stream is not destroyed by the matrix.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - installs a caller-supplied cuSPARSE handle on the
   matrix.  Any previously held (different) handle is destroyed first, so the
   matrix takes ownership of the old handle but assumes the caller keeps the
   new one alive (NOTE(review): ownership of the incoming handle is implicit —
   confirm against callers).  Pointer mode is always reset to DEVICE, which the
   SpMV paths in this file rely on.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we currently own before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - forgets the stored cuSPARSE handle WITHOUT
   destroying it; used when the handle is owned elsewhere and must not be
   freed by this matrix.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory routine registered for the "cusparse" solver package: creates an
   empty SEQAIJCUSPARSE factor matrix B of the same (square) size as A and
   wires up the symbolic-factorization entry points for the requested factor
   type.  Numeric entry points are installed later by the symbolic routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the CPU symbolic factorization allocates the storage later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format in the matrix' CUSPARSE side structure.
   ELL/HYB formats require CUDA >= 4.2, hence the preprocessor guard. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* sequential matrices keep a single format field, so MULT and ALL coincide here */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose the callback */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Options processing: -mat_cusparse_mult_storage_format (SpMV only, skipped on
   factor matrices) and -mat_cusparse_storage_format (SpMV and TriSolve). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric routine so the numeric phase builds GPU factors. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: same pattern — CPU symbolic phase, CUSPARSE numeric phase */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic phase, CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic phase, CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Extracts the unit-diagonal L factor from the combined LU storage of the
   host factor matrix A (a Mat_SeqAIJ holding both factors), uploads it as a
   32-bit-index CSR matrix, and runs the cuSPARSE triangular-solve analysis.
   Host staging buffers use cudaMallocHost (pinned) and are freed on success;
   NOTE(review): they leak on the error paths taken by CHKERRQ/CHKERRCUDA.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is absent or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the implicit unit
         diagonal; each later row i copies its strictly-lower entries from the
         host factor and then appends the unit diagonal entry. */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* append the explicit unit diagonal entry for this row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular, zero-based, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy (thrust vectors own the GPU storage) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix (loTriFactor is only a local copy of the pointer) */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Extracts the U factor (non-unit diagonal, stored with its reciprocal
   diagonal first in each row) from the host factor matrix, uploads it as CSR
   and runs the cuSPARSE solve analysis.  Mirrors the lower-factor builder.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* a->diag runs backwards here: adiag[0]-adiag[n] counts the U entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         because the host storage indexes U rows via adiag in reverse. */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: host stores 1/d, GPU wants d reciprocal-applied */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, zero-based, explicit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds both ILU triangular factors on the GPU, allocates the solve work
   vector, and uploads the row/column permutations (only when they are not
   the identity, so the natural-ordering solve path can skip them).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular (row permutation) indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular (column permutation) indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds both ICC triangular factors on the GPU from the host Cholesky
   factor.  Only U is formed explicitly; the lower solve reuses U's sparsity
   with CUSPARSE_OPERATION_TRANSPOSE (see loTriFactor below).
   NOTE(review): a is cast both as Mat_SeqAIJ and Mat_SeqSBAIJ from the same
   A->data — presumably the host ICC factor stores SBAIJ-compatible arrays;
   confirm against MatICCFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr =
cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; AALo holds the scaled values used
         for the transposed (lower) solve sharing the same sparsity pattern */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (host stores the pivot at v[nz]) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals: negate for U, additionally scale by the pivot for L */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: stored UPPER but solved with
         OPERATION_TRANSPOSE below, so it acts as the lower factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity as U, values scaled for the lower solve */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the ICC factors on the GPU, allocates the solve work vector and
   uploads the (symmetric) permutation to both rperm and cperm when it is
   not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonals are stored once but represent both triangles */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* the single permutation serves both the lower and upper solves */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric LU: factor on the CPU, pick the natural-ordering solve kernels
   when both permutations are the identity, then mirror the factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky: factor on the CPU, pick natural-ordering solve kernels
   when the permutation is the identity, then mirror the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 684 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 685 if (perm_identity) { 686 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 687 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 688 } else { 689 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 690 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 691 } 692 693 /* get the triangular factors */ 694 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 695 PetscFunctionReturn(0); 696 } 697 698 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 699 { 700 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 701 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 702 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 703 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 704 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 705 cusparseStatus_t stat; 706 cusparseIndexBase_t indexBase; 707 cusparseMatrixType_t matrixType; 708 cusparseFillMode_t fillMode; 709 cusparseDiagType_t diagType; 710 711 PetscFunctionBegin; 712 713 /*********************************************/ 714 /* Now the Transpose of the Lower Tri Factor */ 715 /*********************************************/ 716 717 /* allocate space for the transpose of the lower triangular factor */ 718 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 719 720 /* set the matrix descriptors of the lower triangular factor */ 721 matrixType = cusparseGetMatType(loTriFactor->descr); 722 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 723 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
724 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 725 diagType = cusparseGetMatDiagType(loTriFactor->descr); 726 727 /* Create the matrix description */ 728 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 729 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 730 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 731 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 732 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 733 734 /* Create the solve analysis information */ 735 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 736 737 /* set the operation */ 738 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 739 740 /* allocate GPU space for the CSC of the lower triangular factor*/ 741 loTriFactorT->csrMat = new CsrMatrix; 742 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 743 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 744 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 745 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 746 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 747 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 748 749 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor, flipping the fill
     mode since the transpose of an upper triangular matrix is lower triangular */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: stored explicitly transposed, solved untransposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e.
the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds an explicit GPU representation of
   the transpose of A (in the configured storage format) and stores it in
   cusparsestruct->matTranspose, for use by MatMultTranspose/MatMultTransposeAdd.

   NOTE(review): the matstructT initializer reading cusparsestruct->matTranspose is a
   dead store (immediately overwritten with `new` below); a pre-existing transpose
   structure would leak if this were called twice - confirm callers guard with
   `if (!matstructT)`.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cudaError_t err;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars 1, 0 and 1 used as spmv coefficients */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* the transpose has the row/column dimensions of A swapped */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* NOTE(review): row_offsets of the transpose is sized with A->rmap->n+1 while the
       transpose has A->cmap->n rows; for rectangular A these differ - confirm */
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of A, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e.
the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    /* NOTE(review): dimensions are not swapped here although tempT holds the
       transpose; for rectangular A these would be wrong - confirm square-only use */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: identity sequence (no compression on the transpose) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE - xx = inv(A^T) bb for a (possibly permuted)
   factored matrix: applies the row permutation to bb, solves U^T then L^T using the
   explicitly stored transposed factors, and applies the column permutation to the
   result. The transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - xx = inv(A^T) bb for a factored
   matrix with the natural (identity) ordering: no permutations are needed, so the
   solves run directly from bb through the work vector into xx. The transposed
   factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE - xx = inv(A) bb for a (possibly permuted) factored matrix:
   applies the row permutation to bb, solves L then U, and applies the column
   permutation to the result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation; the iteration range is determined by the
     rpermIndices index iterators */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place.
*/
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - xx = inv(A) bb with the natural
   (identity) ordering: no permutations needed, solves L then U directly.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host AIJ matrix onto the GPU in the
   requested storage format (CSR, ELL or HYB), forcing a compressed-row layout when
   the host matrix does not already use one. If only the values changed (same nonzero
   state, CSR format) just the values array is refreshed; otherwise the GPU structure
   is rebuilt from scratch.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* nonzero pattern (or format) changed: rebuild the GPU matrix from scratch */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m = a->compressedrow.nrows;
          ii = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices over the
             nonzero rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k] = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalars 1, 0 and 1 used as spmv coefficients */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* stage a CSR copy on the GPU, convert it to HYB, then discard the staging copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the forced-compression branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Functor over a zipped (src, dst) tuple: dst += src; used to scatter-add
   compressed-row spmv results back into the full result vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* MatMult_SeqAIJCUSPARSE - yy = A xx, implemented as MatMultAdd with no addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatMultTranspose_SeqAIJCUSPARSE - yy = A^T xx, using the explicitly stored
   transpose (built lazily on first use) with an untransposed spmv.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar *xarray;
  PetscScalar *yarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
   MatMultAdd_SeqAIJCUSPARSE - zz = yy + A xx (or zz = A xx when yy is NULL). When the
   matrix is stored compressed (workVector smaller than A->rmap->n) the spmv writes
   into the work vector and the result is scatter-added into zz via cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar *xarray;
  PetscScalar *zarray,*dptr,*beta;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz when no row compression is in effect, otherwise into the work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ?
      zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place only when zz already holds yy and receives the spmv directly */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }

    if (yy) {
      if (dptr != zarray) {
        /* compressed result pending in workVector: seed zz with yy before scatter-add */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult through the work vector: zero zz before scatter-add */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - zz = yy + A^T xx, using the explicitly stored
   transpose (built lazily on first use): zz is seeded with yy, the spmv result lands
   in the work vector, and is scatter-added into zz via cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar *xarray;
  PetscScalar *zarray;
  PetscErrorCode ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1491 size of the workVector */ 1492 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1493 mat->num_rows, mat->num_cols, 1494 mat->num_entries, matstructT->alpha, matstructT->descr, 1495 mat->values->data().get(), mat->row_offsets->data().get(), 1496 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1497 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1498 } else { 1499 #if CUDA_VERSION>=4020 1500 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1501 if (cusparsestruct->workVector->size()) { 1502 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1503 matstructT->alpha, matstructT->descr, hybMat, 1504 xarray, matstructT->beta_zero, 1505 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1506 } 1507 #endif 1508 } 1509 1510 /* scatter the data from the temporary into the full vector with a += operation */ 1511 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1512 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1513 VecCUDAPlusEquals()); 1514 1515 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1516 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1517 1518 } catch(char *ex) { 1519 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1520 } 1521 ierr = WaitForGPU();CHKERRCUDA(ierr); 1522 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1523 PetscFunctionReturn(0); 1524 } 1525 1526 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1527 { 1528 PetscErrorCode ierr; 1529 1530 PetscFunctionBegin; 1531 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1532 if (A->factortype==MAT_FACTOR_NONE) { 1533 ierr = 
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  /* final assembly: route mat-vec operations through the GPU implementations */
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Frees the GPU-side structures (mult struct or triangular factors) before the
   regular SeqAIJ destruction. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicates a SeqAIJCUSPARSE matrix: duplicates the host SeqAIJ data, then attaches
   a fresh (empty) GPU structure; the GPU copy is rebuilt lazily on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
C->ops->mult = MatMult_SeqAIJCUSPARSE; 1665 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1666 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1667 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1668 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1669 1670 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1671 1672 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1673 1674 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1675 PetscFunctionReturn(0); 1676 } 1677 1678 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1679 { 1680 PetscErrorCode ierr; 1681 cusparseStatus_t stat; 1682 cusparseHandle_t handle=0; 1683 1684 PetscFunctionBegin; 1685 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1686 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1687 1688 if (B->factortype==MAT_FACTOR_NONE) { 1689 /* you cannot check the inode.use flag here since the matrix was just created. 
1690 now build a GPU matrix data structure */ 1691 B->spptr = new Mat_SeqAIJCUSPARSE; 1692 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1693 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1694 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1695 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1696 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1697 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1698 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1699 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1700 } else { 1701 /* NEXT, set the pointers to the triangular factors */ 1702 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1703 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1704 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1705 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1706 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1707 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1708 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1709 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1710 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1711 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1712 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1713 } 1714 1715 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1716 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1717 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1718 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1719 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1720 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1721 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1722 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1723 1724 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1725 1726 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1727 1728 ierr = PetscObjectComposeFunction((PetscObject)B, 
"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1729 PetscFunctionReturn(0); 1730 } 1731 1732 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1733 { 1734 PetscErrorCode ierr; 1735 1736 PetscFunctionBegin; 1737 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1738 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B); 1739 PetscFunctionReturn(0); 1740 } 1741 1742 /*MC 1743 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1744 1745 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1746 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1747 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1748 1749 Options Database Keys: 1750 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1751 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1752 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1753 1754 Level: beginner 1755 1756 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1757 M*/ 1758 1759 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1760 1761 1762 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1763 { 1764 PetscErrorCode ierr; 1765 1766 PetscFunctionBegin; 1767 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1768 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1769 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1770 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1771 PetscFunctionReturn(0); 1772 } 1773 1774 1775 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1776 { 1777 cusparseStatus_t stat; 1778 cusparseHandle_t handle; 1779 1780 PetscFunctionBegin; 1781 if (*cusparsestruct) { 1782 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1783 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1784 delete (*cusparsestruct)->workVector; 1785 if (handle = (*cusparsestruct)->handle) { 1786 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1787 } 1788 delete *cusparsestruct; 1789 *cusparsestruct = 0; 1790 } 1791 PetscFunctionReturn(0); 1792 } 1793 1794 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1795 { 1796 PetscFunctionBegin; 1797 if (*mat) { 1798 delete (*mat)->values; 1799 delete (*mat)->column_indices; 1800 delete (*mat)->row_offsets; 1801 delete *mat; 1802 *mat = 0; 1803 } 1804 PetscFunctionReturn(0); 1805 } 1806 1807 
/* Frees one triangular-factor structure: its matrix descriptor, cuSPARSE solve
   analysis info, and CSR storage; then deletes the struct and nulls the pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees one SpMV structure: the stored matrix (HYB or CSR depending on format),
   its descriptor, the compressed-row index vector, and the device-resident
   alpha/beta scalars; then deletes the struct and nulls the pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL is stored by cuSPARSE as a hybrid matrix as well */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code of CsrMatrix_Destroy was previously ignored */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta scalars live in device memory (device pointer mode) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees all four triangular-factor structures (plain and transpose), the
   permutation index vectors, the work vector, and the cuSPARSE handle. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* bug fix: these destroy routines return error codes that were previously ignored */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}