/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Text names for the GPU storage formats, in the layout PetscOptionsEnum()
   expects: the enum value strings, the enum type name, the common prefix,
   and a terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization entry points that are
   installed into the function table of factor matrices created by
   MatGetFactor_seqaijcusparse_cusparse() below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves, options handling, and
   matrix-vector products on the GPU. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side helper structures. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode
Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU
   data structure and attaches it to the matrix's cusparse handle so that
   subsequent cusparse calls made through that handle execute on the stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr must be a Mat_SeqAIJCUSPARSE)
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs an externally created cusparse handle on the
   matrix, destroying any previously owned handle, and switches the handle to
   device pointer mode (scalar arguments of cusparse calls live on the GPU).

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr must be a Mat_SeqAIJCUSPARSE)
-  handle - the cusparse handle to install
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Only destroy the old handle when a DIFFERENT one is being installed;
     destroying unconditionally would invalidate the very handle we are about
     to store if a caller re-sets the same handle twice (use-after-destroy). */
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cusparse handle
   WITHOUT destroying it; used when the handle is owned elsewhere and must
   outlive this matrix.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   MatFactorGetSolverPackage_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the
   solver package backing this factor matrix (composed on factors created by
   MatGetFactor_seqaijcusparse_cusparse()).
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type
providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix B
   for A and installs the CUSPARSE symbolic-factorization routines appropriate
   to the requested factor type; the symbolic routines in turn install the
   numeric routines on B.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is a square SEQAIJCUSPARSE matrix with A's local row size */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr =
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat(); records the requested GPU storage format in the
   matrix's Mat_SeqAIJCUSPARSE structure.  For a sequential matrix both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same (single) format field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL and HYB need the cusparseHybMat_t API introduced in CUDA 4.2 */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on A, if any */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* options
   controlling the GPU storage format used for SpMV and triangular solves.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): return value of PetscObjectOptionsBegin() is assigned but
     not CHKERRQ'd — confirm against the macro's expansion before changing. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option is meaningless on factor matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - CPU symbolic ILU via the SeqAIJ
   routine, then installs the CUSPARSE numeric factorization on B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - CPU symbolic LU via the SeqAIJ
   routine, then installs the CUSPARSE numeric factorization on B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - CPU symbolic ICC via the SeqAIJ
   routine, then installs the CUSPARSE numeric Cholesky on B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - CPU symbolic Cholesky via the
   SeqAIJ routine, then installs the CUSPARSE numeric Cholesky on B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts the unit lower triangular
   factor L from the combined CPU ILU factor stored in A (a factor matrix),
   uploads it as a 32-bit-index CSR matrix to the GPU, and runs the cusparse
   triangular-solve analysis on it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt   n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;       /* pinned host CSR row offsets / column indices of L */
  PetscScalar                       *AALo;              /* pinned host values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is missing or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of the combined factor holds only U entries, hence ai[1]-ai[0]
         contributes nothing to L beyond its unit diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory for the staging buffers */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: copy each row's strictly-lower
         entries and append an explicit 1.0 on the diagonal. */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v  = aa;
      vi = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of L (assign() uploads host -> device) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts the upper triangular
   factor U (non-unit diagonal, stored inverted in the CPU factor) from the
   combined ILU factor in A, uploads it as CSR to the GPU, and runs the
   cusparse triangular-solve analysis on it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;      /* pinned host CSR arrays of U */
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* adiag[] holds U stored backwards; adiag[0]-adiag[n] is its nnz count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows last-to-first because
         the CPU factor stores U in reverse row order */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the CPU factor keeps 1/diag; invert back so U carries the true diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of U (assign() uploads host -> device) */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular
   factors on the GPU, allocates the device work vector used by the solves,
   and caches the row/column permutations when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices: cache the row permutation only when non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices: cache the column permutation only when non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - From the CPU ICC factor (which stores
   only the upper triangle) builds BOTH GPU triangular factors: U as an upper
   CSR solved non-transposed, and "L" as the SAME upper CSR pattern solved
   TRANSPOSED (so no explicit lower-triangular storage is needed), then runs
   the cusparse solve analysis on each.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ and Mat_SeqSBAIJ here;
     this relies on the i/j/a fields lining up for the ICC factor — verify. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging;
         AALo holds the scaled values used for the transposed "lower" solve) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; for the transposed solve they are
             additionally scaled by the (inverted) diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note the "lower" factor is stored with
         the SAME upper pattern (values AALo) and solved with the TRANSPOSE op */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC GPU factors,
   allocates the device work vector, and caches the (single, symmetric)
   permutation for both row and column application when non-trivial.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* only the upper triangle is stored; count both triangles plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric permutation: the same indices serve as row and column maps */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Performs the numeric LU factorization on
   the CPU, selects the natural-ordering fast path for the solves when both
   permutations are the identity, then uploads the factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode
ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutations -> skip the gather/scatter of the rhs/solution */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Performs the numeric Cholesky/ICC
   factorization on the CPU, selects the natural-ordering fast path when the
   permutation is the identity, then uploads the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds explicit transposes (CSC
   via csr2csc) of both triangular factors and runs the cusparse solve
   analysis on them, for use by MatSolveTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* the transpose swaps the fill mode */
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     fill mode flips as for the lower factor above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds the explicit transpose of the assembled matrix (cached in
   cusparsestruct->matTranspose) so MatMultTranspose/MatMultTransposeAdd can
   run a non-transposed SpMV, which is faster than a transposed one in the
   legacy cuSPARSE API.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-side scalars, required by the device pointer mode set below */
  err  = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err  = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): the transpose is given A's shape (rmap x cmap) rather than the
       transposed shape (cmap x rmap), and row_offsets is sized rmap->n+1 while
       csr2csc produces num_cols+1 offsets; this looks correct only for square
       matrices -- verify for rectangular A */
    matrixT->num_rows    = A->rmap->n;
    matrixT->num_cols    = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose of the upper triangular factor, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): as in the CSR branch, tempT uses A's shape, not the transposed
       shape -- presumably safe only for square matrices; verify */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices; left empty here -- the transpose is not
     stored in compressed-row form */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b using the cached explicit transposes of the triangular
   factors (built on first use), with the row/column permutations applied on
   the GPU via thrust permutation iterators.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                          n = xx->map->n;
  PetscScalar                       *xarray, *barray;
  thrust::device_ptr<PetscScalar>   xGPU,bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for the transposed system the factor order is U then L,
     the reverse of MatSolve */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Same as MatSolveTranspose_SeqAIJCUSPARSE but for identity row/column
   permutations, so the permutation copies are skipped entirely.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b with the (permuted) triangular factors on the GPU:
   apply row permutation, solve L, solve U, apply column permutation.
   Both vectors must be of type VECSEQCUDA (checked explicitly).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  thrust::device_ptr<PetscScalar>   xGPU,bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors =
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* reject non-CUDA vectors up front: the solves below operate directly on device arrays */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation; the copy range is fixed by the
     rpermIndices index iterators */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b when the factor ordering is the identity: L then U,
   straight from bb into xx through the work vector, no permutations.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host SeqAIJ data to the GPU in the configured storage format
   (CSR, ELL or HYB), forcing compressed-row storage on the device.  A no-op
   unless the GPU copy is unallocated or stale (valid_GPU_matrix is
   UNALLOCATED or CPU); on success the matrix is marked PETSC_CUDA_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* discard any stale device copy before rebuilding */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      /* count rows with at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow +=
((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets and row indices
           over the nonzero rows only */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

      /* device-side alpha/beta scalars, matching the device pointer mode set below */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage as CSR on the device, then convert to HYB/ELL */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUDA(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging copy is no longer needed */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): with CUDA_VERSION < 4020 this branch leaves matstruct->mat
           unset, unlike the transpose path which raises PETSC_ERR_SUP -- verify
           that such configurations are excluded at build time */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were only allocated here in the non-compressed case */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);

    A->valid_GPU_matrix = PETSC_CUDA_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates compatible right (size cmap->n) and/or left (size rmap->n) work
   vectors of type VECSEQCUDA, sharing the matrix layouts.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: elementwise "dest += src" on a (src,dest) tuple; used to
   scatter compressed-row SpMV results back into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A*x via cuSPARSE SpMV in the configured storage format.  Assumes the
   matrix was already copied to the GPU (done in MatAssemblyEnd).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct =
(Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is set, synchronization is the caller's responsibility */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T*x, implemented as a non-transposed SpMV on the explicitly stored
   transpose (generated lazily on first use).
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = y + A*x: copies y into z, runs SpMV into the compressed-row work
   vector, then scatters the result into z with a += via cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = y + A^T*x, using the lazily generated explicit transpose as in
   MatMultTranspose.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if
(cusparsestruct->format==MAT_CUSPARSE_CSR) { 1573 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1574 /* here we need to be careful to set the number of rows in the multiply to the 1575 number of compressed rows in the matrix ... which is equivalent to the 1576 size of the workVector */ 1577 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1578 mat->num_rows, mat->num_cols, 1579 mat->num_entries, matstructT->alpha, matstructT->descr, 1580 mat->values->data().get(), mat->row_offsets->data().get(), 1581 mat->column_indices->data().get(), xarray, matstructT->beta, 1582 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1583 } else { 1584 #if CUDA_VERSION>=4020 1585 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1586 if (cusparsestruct->workVector->size()) { 1587 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1588 matstructT->alpha, matstructT->descr, hybMat, 1589 xarray, matstructT->beta, 1590 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1591 } 1592 #endif 1593 } 1594 1595 /* scatter the data from the temporary into the full vector with a += operation */ 1596 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1597 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1598 VecCUDAPlusEquals()); 1599 1600 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1601 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1602 1603 } catch(char *ex) { 1604 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1605 } 1606 ierr = WaitForGPU();CHKERRCUDA(ierr); 1607 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1608 PetscFunctionReturn(0); 1609 } 1610 1611 #undef __FUNCT__ 1612 #define 
__FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1613 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1614 { 1615 PetscErrorCode ierr; 1616 1617 PetscFunctionBegin; 1618 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1619 if (A->factortype==MAT_FACTOR_NONE) { 1620 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1621 } 1622 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1623 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1624 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1625 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1626 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1627 PetscFunctionReturn(0); 1628 } 1629 1630 /* --------------------------------------------------------------------------------*/ 1631 #undef __FUNCT__ 1632 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1633 /*@ 1634 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1635 (the default parallel PETSc format). This matrix will ultimately pushed down 1636 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1637 assembly performance the user should preallocate the matrix storage by setting 1638 the parameter nz (or the array nnz). By setting these parameters accurately, 1639 performance during matrix assembly can be increased by more than a factor of 50. 1640 1641 Collective on MPI_Comm 1642 1643 Input Parameters: 1644 + comm - MPI communicator, set to PETSC_COMM_SELF 1645 . m - number of rows 1646 . n - number of columns 1647 . nz - number of nonzeros per row (same for all rows) 1648 - nnz - array containing the number of nonzeros in the various rows 1649 (possibly different for each row) or NULL 1650 1651 Output Parameter: 1652 . A - the matrix 1653 1654 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1655 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1656 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1657 1658 Notes: 1659 If nnz is given then nz is ignored 1660 1661 The AIJ format (also called the Yale sparse matrix format or 1662 compressed row storage), is fully compatible with standard Fortran 77 1663 storage. That is, the stored row and column indices can begin at 1664 either one (as in Fortran) or zero. See the users' manual for details. 1665 1666 Specify the preallocated storage with either nz or nnz (not both). 1667 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1668 allocation. For large problems you MUST preallocate memory or you 1669 will get TERRIBLE performance, see the users' manual chapter on matrices. 1670 1671 By default, this format uses inodes (identical nodes) when possible, to 1672 improve numerical efficiency of matrix-vector products and solves. We 1673 search for consecutive rows with the same nonzero structure, thereby 1674 reusing matrix information to achieve increased efficiency. 
1675 1676 Level: intermediate 1677 1678 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1679 @*/ 1680 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1681 { 1682 PetscErrorCode ierr; 1683 1684 PetscFunctionBegin; 1685 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1686 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1687 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1688 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1689 PetscFunctionReturn(0); 1690 } 1691 1692 #undef __FUNCT__ 1693 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1694 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1695 { 1696 PetscErrorCode ierr; 1697 1698 PetscFunctionBegin; 1699 if (A->factortype==MAT_FACTOR_NONE) { 1700 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1701 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1702 } 1703 } else { 1704 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1705 } 1706 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1707 PetscFunctionReturn(0); 1708 } 1709 1710 #undef __FUNCT__ 1711 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1712 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1713 { 1714 PetscErrorCode ierr; 1715 cusparseStatus_t stat; 1716 cusparseHandle_t handle=0; 1717 1718 PetscFunctionBegin; 1719 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1720 if (B->factortype==MAT_FACTOR_NONE) { 1721 /* you cannot check the inode.use flag here since the matrix was just created. 
1722 now build a GPU matrix data structure */ 1723 B->spptr = new Mat_SeqAIJCUSPARSE; 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1725 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1726 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1727 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1728 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1729 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1730 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1731 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1732 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1733 } else { 1734 /* NEXT, set the pointers to the triangular factors */ 1735 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1736 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1737 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1739 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1740 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1741 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1742 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1743 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1744 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1745 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1746 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1747 } 1748 1749 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1750 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1751 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1752 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1753 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1754 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1755 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1756 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1757 1758 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1759 1760 B->valid_GPU_matrix = 
PETSC_CUDA_UNALLOCATED; 1761 1762 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1763 PetscFunctionReturn(0); 1764 } 1765 1766 /*M 1767 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1768 1769 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1770 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1771 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1772 1773 Options Database Keys: 1774 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1775 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1776 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1777 1778 Level: beginner 1779 1780 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1781 M*/ 1782 1783 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1784 1785 1786 #undef __FUNCT__ 1787 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1788 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1789 { 1790 PetscErrorCode ierr; 1791 1792 PetscFunctionBegin; 1793 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1794 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1795 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1796 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1797 PetscFunctionReturn(0); 1798 } 1799 1800 1801 #undef __FUNCT__ 1802 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1803 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1804 { 1805 cusparseStatus_t stat; 1806 cusparseHandle_t handle; 1807 1808 PetscFunctionBegin; 1809 if (*cusparsestruct) { 1810 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1811 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1812 delete (*cusparsestruct)->workVector; 1813 if (handle = (*cusparsestruct)->handle) { 1814 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1815 } 1816 delete *cusparsestruct; 1817 *cusparsestruct = 0; 1818 } 1819 PetscFunctionReturn(0); 1820 } 1821 1822 #undef __FUNCT__ 1823 #define __FUNCT__ "CsrMatrix_Destroy" 1824 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix 
**mat) 1825 { 1826 PetscFunctionBegin; 1827 if (*mat) { 1828 delete (*mat)->values; 1829 delete (*mat)->column_indices; 1830 delete (*mat)->row_offsets; 1831 delete *mat; 1832 *mat = 0; 1833 } 1834 PetscFunctionReturn(0); 1835 } 1836 1837 #undef __FUNCT__ 1838 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1839 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1840 { 1841 cusparseStatus_t stat; 1842 PetscErrorCode ierr; 1843 1844 PetscFunctionBegin; 1845 if (*trifactor) { 1846 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1847 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1848 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1849 delete *trifactor; 1850 *trifactor = 0; 1851 } 1852 PetscFunctionReturn(0); 1853 } 1854 1855 #undef __FUNCT__ 1856 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1857 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1858 { 1859 CsrMatrix *mat; 1860 cusparseStatus_t stat; 1861 cudaError_t err; 1862 1863 PetscFunctionBegin; 1864 if (*matstruct) { 1865 if ((*matstruct)->mat) { 1866 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1867 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1868 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1869 } else { 1870 mat = (CsrMatrix*)(*matstruct)->mat; 1871 CsrMatrix_Destroy(&mat); 1872 } 1873 } 1874 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1875 delete (*matstruct)->cprowIndices; 1876 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1877 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1878 delete *matstruct; 1879 *matstruct = 0; 1880 } 1881 
PetscFunctionReturn(0); 1882 } 1883 1884 #undef __FUNCT__ 1885 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1886 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1887 { 1888 cusparseHandle_t handle; 1889 cusparseStatus_t stat; 1890 1891 PetscFunctionBegin; 1892 if (*trifactors) { 1893 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1894 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1895 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1896 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1897 delete (*trifactors)->rpermIndices; 1898 delete (*trifactors)->cpermIndices; 1899 delete (*trifactors)->workVector; 1900 if (handle = (*trifactors)->handle) { 1901 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1902 } 1903 delete *trifactors; 1904 *trifactors = 0; 1905 } 1906 PetscFunctionReturn(0); 1907 } 1908 1909