/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option strings matching MatCUSPARSEStorageFormat, in enum order; consumed by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and mult kernels implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* Fixed: this was declared as a second overload of MatSeqAIJCUSPARSEMultStruct_Destroy even
   though it takes a Mat_SeqAIJCUSPARSETriFactorStruct**; the destroy routine for the triangular
   factor struct is the TriFactorStruct variant */
static PetscErrorCode MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Record a (caller-owned) CUDA stream on A and attach it to A's cusparse handle so all
   subsequent cusparse calls for this matrix are issued on that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Adopt a caller-supplied cusparse handle, destroying any handle the matrix previously held.
   The pointer mode is (re)set to DEVICE, i.e. scalar arguments of cusparse calls are expected
   to live in GPU memory */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we currently hold before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cusparse handle; used when ownership lies elsewhere */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Report the solver package name for factors created by this file */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Create an (empty) factor matrix of type MATSEQAIJCUSPARSE for A and install the symbolic
   factorization routines appropriate to the requested factor type */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       nrows = A->rmap->n;
  Mat            fact;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  fact = *B;
  fact->factortype = ftype;
  ierr = MatSetSizes(fact,nrows,nrows,nrows,nrows);CHKERRQ(ierr);
  ierr = MatSetType(fact,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(fact,A,A);CHKERRQ(ierr);
    fact->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    fact->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* actual storage is allocated by the numeric factorization, not here */
  ierr = MatSeqAIJSetPreallocation(fact,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)fact,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetFormat(): record the requested GPU
   storage format on the Mat_SeqAIJCUSPARSE struct.  MULT and ALL both set the single
   format field a sequential matrix carries */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL/HYB storage only exists in cusparse from CUDA 4.2 on */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one was composed on A; a no-op otherwise */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_mult_storage_format (unfactored matrices only) and
   -mat_cusparse_storage_format options, forwarding any selected format to MatCUSPARSESetFormat() */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies only to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then route the numeric phase
   through the CUSPARSE-aware routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: CPU symbolic factorization, GPU-aware numeric factorization */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: CPU symbolic factorization, GPU-aware numeric factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: CPU symbolic factorization, GPU-aware numeric factorization */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the unit-diagonal lower triangular factor L on the GPU from the host-side factored
   AIJ storage of A, and run the cusparse solve analysis on it.
   NOTE(review): assumes the factored SeqAIJ layout where a->i/a->j/a->a hold the strictly
   lower part of each row -- confirm against MatILUFactorSymbolic_SeqAIJ */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;      /* pinned host staging for row offsets / column indices */
  PetscScalar                       *AALo;             /* pinned host staging for values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is absent or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory so the later device assigns copy efficiently */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 holds only its unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: lower triangular, unit diagonal, 0-based indexing */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers; device copies now own the data */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the upper triangular factor U on the GPU from the host-side factored AIJ storage of A
   (U rows addressed through a->diag, stored back-to-front), and run the cusparse solve analysis.
   NOTE(review): assumes the factored SeqAIJ convention that adiag[i] points at the inverted
   diagonal of U and adiag decreases with i -- confirm against MatILUFactorSymbolic_SeqAIJ */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;      /* pinned host staging for row offsets / column indices */
  PetscScalar                       *AAUp;             /* pinned host staging for values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is absent or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; walk rows last-to-first, filling the arrays from the back */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] holds the stored (inverted) diagonal; 1./v[nz] recovers the true diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, non-unit diagonal, 0-based indexing */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers; device copies now own the data */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push both ILU triangular factors of A to the GPU, allocate the solve work vector, and
   stage the row/column permutations (only when they are not the identity) */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix =
PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  /* device-side row permutation, staged only when a real reordering is in effect */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  /* device-side column permutation, staged only when a real reordering is in effect */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC triangular factors on the GPU from the host-side factor of A.  Only the
   upper factor U is stored explicitly; the "lower" solve reuses U's CSR data with
   solveOp = TRANSPOSE, so L never materializes as its own matrix.
   NOTE(review): A->data is read through both Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a) -- this relies on the two structs sharing that layout; confirm */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host staging for the CSR pattern (shared by both factors) */
  PetscScalar                       *AAUp;         /* pinned host staging for the U values */
  PetscScalar                       *AALo;         /* pinned host staging for the (transposed-solve) L values */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* only rebuild when the GPU copy is absent or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored (inverted) diagonal entry of row i */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; L additionally rescales by the stored diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor is U^T, so it keeps UPPER fill
         mode and is solved with the TRANSPOSE operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as U, values scaled for the transposed solve */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers; device copies now own the data */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Push both ICC triangular factors of A to the GPU, allocate the solve work vector, and
   stage the (symmetric) permutation when it is not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* total nonzeros of both factors: off-diagonals counted twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  /* Cholesky uses one symmetric permutation for both rows and columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: factor on the CPU, select the natural-ordering fast path when no
   permutation is present, then copy the factors to the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: factor on the CPU, select the natural-ordering fast path when no
   permutation is present, then copy the factors to the GPU */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
 */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds, on the GPU, the transposes of the lower and upper triangular factors
   (CSR -> CSC via csr2csc, which is the transpose in CSR form) and runs the
   cusparse solve analysis on each, caching the results in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose so MatSolveTranspose_* can
   use them later.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     note the fill mode flips under transposition (lower becomes upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the factor is stored already transposed, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     again the fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   Builds, on the GPU, the explicit transpose of A (in the currently selected
   storage format) and caches it in cusparsestruct->matTranspose, so that
   MatMultTranspose/MatMultTransposeAdd can run a NON_TRANSPOSE spmv on it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalar constants used by spmv
     (the handle is placed in DEVICE pointer mode below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows       = A->cmap->n;  /* transpose: dimensions swap */
    matrixT->num_cols       = A->rmap->n;
    matrixT->num_entries    = a->nz;
    matrixT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps num_rows = A->rmap->n and num_cols = A->cmap->n
       (not swapped), and row_offsets sized by A->rmap->n+1; this appears to
       assume a square matrix — confirm for rectangular A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so this is just the identity map 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b on the GPU for a permuted factorization: apply the row
   permutation to b, solve with U^T then L^T (the reverse of the forward-solve
   order), then apply the column permutation to x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b on the GPU when the factorization used the natural
   ordering: no permutations are needed, just U^T then L^T.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b on the GPU for a permuted factorization: apply the row
   permutation to b, solve L then U, then apply the column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation; the range length is defined by
     the rpermIndices begin/end pair (the element base bGPU is not advanced) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b on the GPU when the factorization used the natural ordering:
   no permutations, just L then U through the work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Copies the host AIJ matrix to the GPU in the selected storage format
   (CSR/ELL/HYB), forcing a compressed-row representation (rows with at least
   one nonzero) when the host matrix does not already use one. When only the
   numerical values changed (same nonzero pattern, CSR format), just the
   values array is re-copied.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      /* NOTE(review): return value of the destroy routine is not error-checked here */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants for spmv (DEVICE pointer mode set below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* temporary CSR image of the matrix, converted to HYB below */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only when compressed row was forced */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulates the first tuple element into the second,
   i.e. get<1>(t) += get<0>(t); used to scatter-add work-vector entries
   into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A x, implemented as MatMultAdd with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T x, via a NON_TRANSPOSE spmv on the cached explicit transpose */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is set, the caller is responsible for synchronization */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/* z = y + A x (yy may be NULL for plain MatMult, or alias zz) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if the work vector covers all rows (no rows were compressed away) the
       spmv can write straight into z; otherwise it goes through the work
       vector and is scattered below */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta=1 only when the spmv accumulates directly into z==y */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }

    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* z = y + A^T x, via a NON_TRANSPOSE spmv on the cached explicit transpose */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta_zero,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation;
       cprowIndices of the transpose is the identity, so the range length is A->cmap->n */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr =
MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  /* only after final assembly do the GPU kernels take over the multiplies */
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatDestroy_SeqAIJCUSPARSE - frees the GPU-side data (either the mult structures
   or the triangular-factor structures, depending on the factor type) and then
   destroys the host AIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing was ever pushed to the GPU if the matrix is still unallocated there */
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates on the host via MatDuplicate_SeqAIJ and
   then attaches a fresh (empty) GPU data structure to the copy; the GPU data itself
   is rebuilt lazily on the next MatSeqAIJCUSPARSECopyToGPU.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0; /* assigned above too; kept for parity with MatConvert */
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          =
MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the duplicate has no GPU copy yet; it is created on the next copy-to-GPU */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a SeqAIJ matrix in place to
   SeqAIJCUSPARSE: switches the default vector type to CUDA, attaches an empty
   GPU data structure (mult structs or triangular-factor structs depending on
   the factor type) with its own cusparse handle, and installs the GPU ops.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr =
PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the converted matrix has no GPU copy yet; it is created on the next copy-to-GPU */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreate_SeqAIJCUSPARSE - type constructor: builds a host SeqAIJ matrix and
   converts it in place to SeqAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* fix: the return code was assigned but never checked, silently swallowing
     conversion failures (e.g. cusparseCreate errors) */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/*
   MatSolverTypeRegister_CUSPARSE - registers the cusparse factorization back-end
   for LU, Cholesky, ILU and ICC on MATSEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSE_Destroy - frees the GPU mult structures (matrix and its
   transpose), the compressed-row work vector, and the cusparse handle.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    if (handle = (*cusparsestruct)->handle) { /* assignment intended: grab and test the handle */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}

/*
   CsrMatrix_Destroy - frees the three device vectors of a CsrMatrix and the
   struct itself; safe to call on an already-NULL pointer.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees one
   triangular factor: its cusparse descriptor, solve-analysis info, and CSR data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - frees a SpMV
   structure; the matrix payload is format-dependent (HYB handle vs CsrMatrix),
   and the alpha/beta scalars live in device memory so they need cudaFree.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are device-resident scalars (see the spmv calls above) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSETriFactors_Destroy - frees all four triangular factors, the
   permutation index vectors, the work vector, and the cusparse handle.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
    MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if (handle = (*trifactors)->handle) { /* assignment intended: grab and test the handle */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}