/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
/* cusparsematimpl.h provides its own VecType; the CPU definition must be dropped first,
   so this #undef must stay between the two includes -- do not reorder */
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Text names for the MatCUSPARSEStorageFormat enum, in the layout PetscOptionsEnum() expects
   (values, enum name, prefix, NULL terminator) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and mult kernels implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data hung off Mat->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in A's GPU data and tells the
   cuSPARSE library to issue its work on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr must already hold a Mat_SeqAIJCUSPARSE)
-  stream - the CUDA stream to use for subsequent cuSPARSE calls on A

   Note: this file checks cusparseStatus_t return values with CHKERRCUDA.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - replaces the cuSPARSE handle stored in A's GPU data.

   Any handle previously stored in A is destroyed first, and the new handle is
   switched to device pointer mode (cuSPARSE scalar arguments are expected to
   live in device memory).

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - an initialized cusparseHandle_t; A takes ownership of it
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* drop the old handle so we do not leak cuSPARSE library state */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored in A's GPU data
   WITHOUT destroying it.

   NOTE(review): the handle is only zeroed, never cusparseDestroy()ed here; this is
   correct only if the caller retains ownership of the handle (e.g. it was installed
   via MatCUSPARSESetHandle by an outer layer that will destroy it) -- confirm
   against the call sites.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices (see MatGetFactor_seqaijcusparse_cusparse):
   reports MATSOLVERCUSPARSE as the solver package that produced the factor. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type
  providing triangular solvers for sequential matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix B for A
   and installs the symbolic factorization kernels appropriate for the factor type.

   Input Parameters:
+  A     - the matrix to be factored (sizes are taken from A->rmap->n; B is created square n x n)
-  ftype - one of MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ICC

   Output Parameter:
.  B - the factor matrix, of type MATSEQAIJCUSPARSE with no preallocated storage
       (MAT_SKIP_ALLOCATION; the symbolic phase allocates)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    /* NOTE(review): unlike the LU/ILU branch above, block sizes are NOT copied from A
       here -- confirm whether this asymmetry is intentional */
    (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); records the requested storage format in A's GPU data.
   For SEQAIJCUSPARSE both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same
   (single) format field. ELL/HYB formats require CUDA 4.2 or later.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUDA: only CSR is available; silently accept CSR, reject ELL/HYB */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op if A's type did not compose MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options database
   entries. The mult-only format option is offered only for unfactored matrices;
   the blanket -mat_cusparse_storage_format applies to SpMV and TriSolve.

   NOTE(review): PetscOptionsHead() is called without a matching PetscOptionsTail(),
   and PetscObjectOptionsBegin() pairs with the single PetscOptionsEnd() -- confirm
   this pairing is what the options framework expects here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: defer entirely to the CPU AIJ implementation, then install the
   CUSPARSE numeric kernel so the numeric phase builds the GPU triangular factors. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: same pattern as ILU above -- CPU symbolic phase, GPU numeric kernel. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: CPU symbolic phase, then install the CUSPARSE Cholesky numeric kernel. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: CPU symbolic phase, then install the CUSPARSE Cholesky numeric kernel. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower triangular
   factor L from the combined LU factor stored in A (a Mat_SeqAIJ holding the factored
   data), stages it in pinned host memory, uploads it into thrust device arrays, and
   runs the cuSPARSE triangular-solve analysis on it.

   Only runs when the GPU copy is absent or stale (PETSC_CUDA_UNALLOCATED/PETSC_CUDA_CPU).
   Row 0 of L holds only its unit diagonal; rows 1..n-1 copy ai[i+1]-ai[i] off-diagonal
   entries from the factored storage and append an explicit 1.0 diagonal entry.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1], so this assumes n >= 1 -- confirm empty matrices cannot reach here */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix: pinned host memory for fast upload */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): the value loaded from loTriFactorPtr above is overwritten here without
         being destroyed; if this path ever runs twice for the same factor the previous
         struct leaks -- confirm re-factorization resets spptr first */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the staged CSR arrays into device-side thrust vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis (legacy cuSPARSE csrsv analysis phase) */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the device vectors own copies now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - companion to the lower-factor builder above:
   extracts the non-unit-diagonal upper triangular factor U from the factored Mat_SeqAIJ
   (using a->diag to locate each row's diagonal), stages it in pinned host memory, uploads
   it to the GPU, and runs the cuSPARSE solve analysis. The stored diagonal entries are the
   reciprocals 1/d_i, as produced below.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases as i grows in this factored layout, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         filling the staging arrays back-to-front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the factored storage holds 1/d_i at v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, upper fill, NON-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload staged CSR arrays into device-side thrust vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* device vectors own copies now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors on
   the GPU, allocates the device work vector used by the solves, and caches the row
   (isrow) and column (icol) permutations as device index arrays when the orderings
   are not identities. Marks A's GPU copy up to date (PETSC_CUDA_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the lower/upper triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /* lower triangular indices: cache the row permutation on the device only if nontrivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds BOTH triangular factors for ICC/Cholesky
   from the factored data in A. Only the upper factor's CSR arrays are staged; the lower
   solve is later performed as a transposed solve on the same sparsity pattern with a
   separate values array (AALo).

   NOTE(review): A->data is cast both to Mat_SeqAIJ* (for a->nz) and to Mat_SeqSBAIJ*
   (for the i/j/a arrays); this relies on the ICC/Cholesky factor being stored in
   SBAIJ-compatible layout -- confirm against MatICCFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix; AALo gets the scaled values used by the
         transposed (lower) solve over the same sparsity pattern */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements: the stored diagonal v[nz] is inverted */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals: negate for the upper factor, additionally divide by the
             diagonal for the lower (transposed) factor */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular with UNIT diagonal
         (the actual diagonal scaling lives in the stored values) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload staged CSR arrays into device-side thrust vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. FILL_MODE_UPPER is intentional here: the lower
         solve reuses the UPPER CSR pattern and is executed with OPERATION_TRANSPOSE below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: solve with the transpose of the stored (upper) matrix */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, values from AALo */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* device vectors own copies now; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors on the
   GPU, allocates the device work vector, and caches the (single, symmetric) permutation
   as both row and column device index arrays when it is not the identity.
   nnz is the full symmetric count: off-diagonals twice plus the diagonal once.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices: the same permutation serves both row and column sides */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization: runs the CPU AIJ
   numeric phase, picks the natural-ordering solve variants when both row and column
   orderings are identities, then uploads/analyzes the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used:
     natural-ordering variants skip the device permutation gather/scatter */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky/ICC factorization: runs
   the CPU AIJ numeric phase, selects the natural-ordering solve variants when the
   single permutation is the identity, then uploads/analyzes the factors on the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit CSC (transposed CSR)
   copies of both triangular factors via cusparse_csr2csc and runs the solve analysis
   on them, so that MatSolveTranspose can use non-transposed triangular solves.
   The descriptor of each transpose flips the fill mode of its source factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 765 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 766 diagType = cusparseGetMatDiagType(loTriFactor->descr); 767 768 /* Create the matrix description */ 769 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 770 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 771 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 772 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 773 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 774 775 /* Create the solve analysis information */ 776 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 777 778 /* set the operation */ 779 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 780 781 /* allocate GPU space for the CSC of the lower triangular factor*/ 782 loTriFactorT->csrMat = new CsrMatrix; 783 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 784 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 785 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 786 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 787 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 788 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 789 790 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 791 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 792 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 793 loTriFactor->csrMat->values->data().get(), 794 loTriFactor->csrMat->row_offsets->data().get(), 795 loTriFactor->csrMat->column_indices->data().get(), 796 loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->csrMat->row_offsets->data().get(), 799 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 800 801 /* perform the solve analysis on the transposed matrix */ 802 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 803 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 804 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 805 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 806 loTriFactorT->solveInfo);CHKERRCUDA(stat); 807 808 /* assign the pointer. Is this really necessary? */ 809 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 810 811 /*********************************************/ 812 /* Now the Transpose of the Upper Tri Factor */ 813 /*********************************************/ 814 815 /* allocate space for the transpose of the upper triangular factor */ 816 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 817 818 /* set the matrix descriptors of the upper triangular factor */ 819 matrixType = cusparseGetMatType(upTriFactor->descr); 820 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 821 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
822 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 823 diagType = cusparseGetMatDiagType(upTriFactor->descr); 824 825 /* Create the matrix description */ 826 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 827 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 828 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 829 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 830 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 831 832 /* Create the solve analysis information */ 833 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 834 835 /* set the operation */ 836 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 837 838 /* allocate GPU space for the CSC of the upper triangular factor*/ 839 upTriFactorT->csrMat = new CsrMatrix; 840 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 841 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 842 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 843 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 844 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 845 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 846 847 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 848 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 849 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 850 upTriFactor->csrMat->values->data().get(), 851 upTriFactor->csrMat->row_offsets->data().get(), 852 upTriFactor->csrMat->column_indices->data().get(), 853 upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->csrMat->row_offsets->data().get(), 856 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 857 858 /* perform the solve analysis on the transposed matrix */ 859 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 860 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 861 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 862 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 863 upTriFactorT->solveInfo);CHKERRCUDA(stat); 864 865 /* assign the pointer. Is this really necessary? 
*/ 866 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 867 PetscFunctionReturn(0); 868 } 869 870 #undef __FUNCT__ 871 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 872 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 873 { 874 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 875 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 876 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 877 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 878 cusparseStatus_t stat; 879 cusparseIndexBase_t indexBase; 880 cudaError_t err; 881 882 PetscFunctionBegin; 883 884 /* allocate space for the triangular factor information */ 885 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 886 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 887 indexBase = cusparseGetMatIndexBase(matstruct->descr); 888 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 889 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 890 891 /* set alpha and beta */ 892 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 893 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 894 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 895 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 896 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 897 898 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 899 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 900 CsrMatrix *matrixT= new CsrMatrix; 901 matrixT->num_rows = A->rmap->n; 902 matrixT->num_cols = A->cmap->n; 903 matrixT->num_entries = a->nz; 904 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose: the CSC of A is the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR: temp holds A itself, so rmap->n rows */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).  A^T is
       cmap->n by rmap->n, so its row pointer needs cmap->n+1 entries; the
       previous code sized everything with rmap->n, which breaks rectangular
       matrices. */
    CsrMatrix *tempT = new CsrMatrix;
    tempT->num_rows       = A->cmap->n;
    tempT->num_cols       = A->rmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert the transpose back to HYB; the dimensions passed are
       those of A^T (cmap->n rows, rmap->n columns), not of A */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solve A^T x = b with row/column permutations, using cached transposed
   triangular factors (built on first use). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A^T x = b when the ordering is the identity (no permutations). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solve A x = b with the cached GPU triangular factors, applying the row
   permutation to b on the way in and the column permutation on the way out. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
*/
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A x = b when the ordering is the identity: no permutations, just the
   two triangular solves through the work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the host AIJ matrix to the GPU in the chosen storage format (CSR, ELL
   or HYB), forcing compressed-row storage on the GPU even when the host does
   not use it.  No-op when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* free any previous GPU copy; the error code was previously ignored */
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr);
    try {
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]   = a->i[j];
            ridx[k] = j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

      /* device-resident alpha/beta for the DEVICE pointer mode */
      err  = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      err  = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows       = m;
        matrix->num_cols       = A->cmap->n;
        matrix->num_entries    = a->nz;
        matrix->row_offsets    = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values         = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage a CSR copy, convert it to HYB, then discard the staging */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows       = m;
        matrix->num_cols       = A->cmap->n;
        matrix->num_entries    = a->nz;
        matrix->row_offsets    = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values         = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUDA(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#else
        /* previously this branch compiled to nothing and left matstruct->mat
           unset; fail loudly instead */
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage formats require CUDA 4.2 or later.");
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);

    A->valid_GPU_matrix = PETSC_CUDA_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/* Create compatible CUDA vectors for this matrix (right: domain, left: range). */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr =
MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* functor: adds component 0 of the tuple into component 1 (z[i] += w[i]) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A x using the cached GPU copy of the matrix. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#else
    /* previously this branch compiled to nothing and silently left y untouched */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage formats require CUDA 4.2 or later.");
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T x via a NON_TRANSPOSE spmv on the cached explicit transpose,
   building the transpose on first use. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#else
    /* previously this branch compiled to nothing and silently left y untouched */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage formats require CUDA 4.2 or later.");
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A x; the spmv writes into the compressed-row work vector, which is
   then scattered into z with a += through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
PetscFunctionReturn(0); 1552 } 1553 1554 #undef __FUNCT__ 1555 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1556 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1557 { 1558 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1559 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1560 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1561 thrust::device_ptr<PetscScalar> zptr; 1562 const PetscScalar *xarray; 1563 PetscScalar *zarray; 1564 PetscErrorCode ierr; 1565 cusparseStatus_t stat; 1566 1567 PetscFunctionBegin; 1568 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1569 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1570 if (!matstructT) { 1571 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1572 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1573 } 1574 1575 try { 1576 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1577 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1578 ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1579 zptr = thrust::device_pointer_cast(zarray); 1580 1581 /* multiply add with matrix transpose */ 1582 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1583 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1584 /* here we need to be careful to set the number of rows in the multiply to the 1585 number of compressed rows in the matrix ... 
which is equivalent to the 1586 size of the workVector */ 1587 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1588 mat->num_rows, mat->num_cols, 1589 mat->num_entries, matstructT->alpha, matstructT->descr, 1590 mat->values->data().get(), mat->row_offsets->data().get(), 1591 mat->column_indices->data().get(), xarray, matstructT->beta, 1592 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1593 } else { 1594 #if CUDA_VERSION>=4020 1595 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1596 if (cusparsestruct->workVector->size()) { 1597 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1598 matstructT->alpha, matstructT->descr, hybMat, 1599 xarray, matstructT->beta, 1600 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1601 } 1602 #endif 1603 } 1604 1605 /* scatter the data from the temporary into the full vector with a += operation */ 1606 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1607 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1608 VecCUDAPlusEquals()); 1609 1610 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1611 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1612 1613 } catch(char *ex) { 1614 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1615 } 1616 ierr = WaitForGPU();CHKERRCUDA(ierr); 1617 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1618 PetscFunctionReturn(0); 1619 } 1620 1621 #undef __FUNCT__ 1622 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1623 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1624 { 1625 PetscErrorCode ierr; 1626 1627 PetscFunctionBegin; 1628 ierr = 
MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1629 if (A->factortype==MAT_FACTOR_NONE) { 1630 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1631 } 1632 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1633 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1634 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1635 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1636 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1637 PetscFunctionReturn(0); 1638 } 1639 1640 /* --------------------------------------------------------------------------------*/ 1641 #undef __FUNCT__ 1642 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1643 /*@ 1644 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1645 (the default parallel PETSc format). This matrix will ultimately pushed down 1646 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1647 assembly performance the user should preallocate the matrix storage by setting 1648 the parameter nz (or the array nnz). By setting these parameters accurately, 1649 performance during matrix assembly can be increased by more than a factor of 50. 1650 1651 Collective on MPI_Comm 1652 1653 Input Parameters: 1654 + comm - MPI communicator, set to PETSC_COMM_SELF 1655 . m - number of rows 1656 . n - number of columns 1657 . nz - number of nonzeros per row (same for all rows) 1658 - nnz - array containing the number of nonzeros in the various rows 1659 (possibly different for each row) or NULL 1660 1661 Output Parameter: 1662 . A - the matrix 1663 1664 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1665 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1666 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1667 1668 Notes: 1669 If nnz is given then nz is ignored 1670 1671 The AIJ format (also called the Yale sparse matrix format or 1672 compressed row storage), is fully compatible with standard Fortran 77 1673 storage. That is, the stored row and column indices can begin at 1674 either one (as in Fortran) or zero. See the users' manual for details. 1675 1676 Specify the preallocated storage with either nz or nnz (not both). 1677 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1678 allocation. For large problems you MUST preallocate memory or you 1679 will get TERRIBLE performance, see the users' manual chapter on matrices. 1680 1681 By default, this format uses inodes (identical nodes) when possible, to 1682 improve numerical efficiency of matrix-vector products and solves. We 1683 search for consecutive rows with the same nonzero structure, thereby 1684 reusing matrix information to achieve increased efficiency. 
1685 1686 Level: intermediate 1687 1688 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1689 @*/ 1690 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1691 { 1692 PetscErrorCode ierr; 1693 1694 PetscFunctionBegin; 1695 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1696 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1697 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1698 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1699 PetscFunctionReturn(0); 1700 } 1701 1702 #undef __FUNCT__ 1703 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1704 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1705 { 1706 PetscErrorCode ierr; 1707 1708 PetscFunctionBegin; 1709 if (A->factortype==MAT_FACTOR_NONE) { 1710 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1711 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1712 } 1713 } else { 1714 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1715 } 1716 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1717 PetscFunctionReturn(0); 1718 } 1719 1720 #undef __FUNCT__ 1721 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1722 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1723 { 1724 PetscErrorCode ierr; 1725 cusparseStatus_t stat; 1726 cusparseHandle_t handle=0; 1727 1728 PetscFunctionBegin; 1729 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1730 if (B->factortype==MAT_FACTOR_NONE) { 1731 /* you cannot check the inode.use flag here since the matrix was just created. 
1732 now build a GPU matrix data structure */ 1733 B->spptr = new Mat_SeqAIJCUSPARSE; 1734 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1735 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1736 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1737 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1738 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1739 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1740 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1741 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1742 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1743 } else { 1744 /* NEXT, set the pointers to the triangular factors */ 1745 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1746 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1747 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1748 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1749 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1750 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1751 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1752 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1753 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1754 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1755 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1756 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1757 } 1758 1759 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1760 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1761 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1762 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1763 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1764 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1765 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1766 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1767 1768 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1769 1770 B->valid_GPU_matrix = 
PETSC_CUDA_UNALLOCATED; 1771 1772 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1773 PetscFunctionReturn(0); 1774 } 1775 1776 /*M 1777 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1778 1779 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1780 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1781 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1782 1783 Options Database Keys: 1784 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1785 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1786 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1787 1788 Level: beginner 1789 1790 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1791 M*/ 1792 1793 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1794 1795 1796 #undef __FUNCT__ 1797 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1798 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1799 { 1800 PetscErrorCode ierr; 1801 1802 PetscFunctionBegin; 1803 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1804 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1805 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1806 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1807 PetscFunctionReturn(0); 1808 } 1809 1810 1811 #undef __FUNCT__ 1812 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1813 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1814 { 1815 cusparseStatus_t stat; 1816 cusparseHandle_t handle; 1817 1818 PetscFunctionBegin; 1819 if (*cusparsestruct) { 1820 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1821 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1822 delete (*cusparsestruct)->workVector; 1823 if (handle = (*cusparsestruct)->handle) { 1824 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1825 } 1826 delete *cusparsestruct; 1827 *cusparsestruct = 0; 1828 } 1829 PetscFunctionReturn(0); 1830 } 1831 1832 #undef __FUNCT__ 1833 #define __FUNCT__ "CsrMatrix_Destroy" 1834 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix 
**mat) 1835 { 1836 PetscFunctionBegin; 1837 if (*mat) { 1838 delete (*mat)->values; 1839 delete (*mat)->column_indices; 1840 delete (*mat)->row_offsets; 1841 delete *mat; 1842 *mat = 0; 1843 } 1844 PetscFunctionReturn(0); 1845 } 1846 1847 #undef __FUNCT__ 1848 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1849 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1850 { 1851 cusparseStatus_t stat; 1852 PetscErrorCode ierr; 1853 1854 PetscFunctionBegin; 1855 if (*trifactor) { 1856 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1857 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1858 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1859 delete *trifactor; 1860 *trifactor = 0; 1861 } 1862 PetscFunctionReturn(0); 1863 } 1864 1865 #undef __FUNCT__ 1866 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1867 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1868 { 1869 CsrMatrix *mat; 1870 cusparseStatus_t stat; 1871 cudaError_t err; 1872 1873 PetscFunctionBegin; 1874 if (*matstruct) { 1875 if ((*matstruct)->mat) { 1876 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1877 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1878 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1879 } else { 1880 mat = (CsrMatrix*)(*matstruct)->mat; 1881 CsrMatrix_Destroy(&mat); 1882 } 1883 } 1884 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1885 delete (*matstruct)->cprowIndices; 1886 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1887 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1888 delete *matstruct; 1889 *matstruct = 0; 1890 } 1891 
PetscFunctionReturn(0); 1892 } 1893 1894 #undef __FUNCT__ 1895 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1896 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1897 { 1898 cusparseHandle_t handle; 1899 cusparseStatus_t stat; 1900 1901 PetscFunctionBegin; 1902 if (*trifactors) { 1903 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1904 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1905 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1906 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1907 delete (*trifactors)->rpermIndices; 1908 delete (*trifactors)->cpermIndices; 1909 delete (*trifactors)->workVector; 1910 if (handle = (*trifactors)->handle) { 1911 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1912 } 1913 delete *trifactors; 1914 *trifactors = 0; 1915 } 1916 PetscFunctionReturn(0); 1917 } 1918 1919