/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Enum-name table for MatCUSPARSEStorageFormat; the trailing three entries are the
   type name, option prefix, and terminator required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's GPU data and
   attaches it to the matrix's CUSPARSE handle so subsequent CUSPARSE calls run on it.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (must have a populated spptr)
-  stream - the CUDA stream; ownership remains with the caller
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard: previously this dereferenced a NULL spptr when called on a non-CUSPARSE matrix */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing CUSPARSE struct: matrix must be of type SEQAIJCUSPARSE");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs an externally created CUSPARSE handle on the matrix,
   destroying any handle the matrix previously owned, and switches the handle to
   device pointer mode (scalar arguments such as alpha/beta live in GPU memory).

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (must have a populated spptr)
-  handle - the CUSPARSE handle to install; the matrix takes no ownership beyond storing it
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard: previously this dereferenced a NULL spptr when called on a non-CUSPARSE matrix */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing CUSPARSE struct: matrix must be of type SEQAIJCUSPARSE");
  if (cusparsestruct->handle) {
    /* release the handle the matrix currently owns before overwriting it */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its CUSPARSE handle
   WITHOUT destroying it (the handle is owned elsewhere, e.g. by the MPI layer).

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE; a matrix without GPU data is a no-op
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  /* unconditional assignment; the previous 'if (handle)' guard was redundant */
  cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type
providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the empty square factor matrix B for A
   and installs the CUSPARSE symbolic-factorization callbacks appropriate for ftype.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* inherit block sizes for every factor type; previously only the LU/ILU branch did this,
     so Cholesky/ICC factors never received the block-size information */
  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* nonzero structure is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(); stores the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE struct. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations store into the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only CSR exists; requesting the (default) CSR format is a no-op */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Processes the -mat_cusparse_* options; the mult-only format option applies solely to
   unfactored matrices, while -mat_cusparse_storage_format applies to both SpMV and TriSolve. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic ILU; afterwards redirects the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic LU; afterwards redirects the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic ICC; afterwards redirects the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* CPU symbolic Cholesky; afterwards redirects the numeric stage to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Packs the unit-diagonal lower triangular factor L out of the host factor storage,
   uploads it, and runs the CUSPARSE triangular-solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt   n = A->rmap->n;
Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor;   /* dead initializer removed: always assigned with new below */
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* number of nonzeros in the lower triangular matrix, including the unit diagonal;
         guarded for n==0 because the old expression read ai[1] past the end of the
         (length one) row-pointer array of an empty matrix */
      nzLower = (n > 0) ? n+ai[n]-ai[1] : 0;

      /* pinned host staging buffers for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* fill the lower triangular matrix; row 0 holds only the implicit unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      if (n > 0) {               /* guard the fixed row-0 writes for empty matrices */
        AjLo[0] = (PetscInt) 0;
        AALo[0] = (MatScalar) 1.0;
      }
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* append the unit diagonal entry after the strictly-lower part of the row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the CSR arrays to the GPU */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Packs the upper triangular factor U (non-unit diagonal, stored inverted in the host
   factor) out of the host factor storage, uploads it, and runs the CUSPARSE analysis. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      /* a->diag is stored in reverse for the U part, hence the subtraction order */
      nzUpper = adiag[0]-adiag[n];

      /* pinned host staging buffers for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* walk the rows bottom-up, filling the CSR arrays back-to-front */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* off-diagonal data of row i lives just past adiag[i+1] */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* back up by this row's width (off-diagonals plus the diagonal) */
        offset -= (nz+1);
        AiUp[i] = AiUp[i+1] - (nz+1);

        /* diagonal entry first (host factor stores its reciprocal at v[nz]) ... */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];

        /* ... then the off-diagonal entries of the row */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* fresh triangular-factor bookkeeping for U */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* descriptor: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* analysis info object for the triangular solve */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the CSR arrays to the GPU */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* run the solve analysis on the uploaded factor */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* publish the factor on the matrix */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging buffers are no longer needed */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Builds both ILU triangular factors on the GPU, allocates the solve work vector,
   and caches the row/column permutations when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz = a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;

  /* row permutation (lower triangular solve); cached only when non-identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation (upper triangular solve); cached only when non-identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds both ICC triangular factors on the GPU from a single upper-triangular host
   factor: U is stored directly, and the "lower" solve reuses U's structure with the
   transpose operation and separately scaled values (AALo). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): the same A->data is read through the SBAIJ layout here — presumably the
     ICC factor stores its arrays compatibly; confirm against the factorization code */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* pinned host staging buffers; AALo holds the rescaled values for the lower solve */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* pack the upper triangular matrix row by row */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = 0;
      for (i=0; i<n; i++) {
        /* row i of the host factor; its last stored entry v[nz] is the diagonal */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* diagonal entry first (host factor stores its reciprocal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset += 1;
        if (nz > 0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate for U; additionally divide by the diagonal for the lower-solve values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset += nz;
        }
      }

      /* ---- upper triangular factor U ---- */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* descriptor: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* ---- "lower" factor: U's structure solved transposed with the AALo values ---- */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* descriptor: still upper fill (the transpose operation gives the lower solve) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;

      /* staging buffers are no longer needed */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Builds both ICC factors on the GPU, allocates the solve work vector, and caches the
   (symmetric) permutation for both triangular solves when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* off-diagonals are used twice (once per triangular factor), diagonal once */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  /* the single permutation is applied on both sides of the symmetric factorization */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* CPU numeric LU factorization followed by upload/analysis of the triangular factors;
   selects the natural-ordering solve kernels when both permutations are the identity. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the MatSolve variant: natural ordering avoids the permutation kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* CPU numeric Cholesky factorization followed by upload/analysis of the factors;
   selects the natural-ordering solve kernels when the permutation is the identity. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 723 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 724 if (perm_identity) { 725 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 726 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 727 } else { 728 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 729 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 730 } 731 732 /* get the triangular factors */ 733 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737 #undef __FUNCT__ 738 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 739 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 740 { 741 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 742 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 743 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 744 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 745 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 746 cusparseStatus_t stat; 747 cusparseIndexBase_t indexBase; 748 cusparseMatrixType_t matrixType; 749 cusparseFillMode_t fillMode; 750 cusparseDiagType_t diagType; 751 752 PetscFunctionBegin; 753 754 /*********************************************/ 755 /* Now the Transpose of the Lower Tri Factor */ 756 /*********************************************/ 757 758 /* allocate space for the transpose of the lower triangular factor */ 759 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 760 761 /* set the matrix descriptors of the lower triangular factor */ 762 matrixType = cusparseGetMatType(loTriFactor->descr); 763 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 764 fillMode = 
               cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
                 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the factor is stored explicitly transposed, so the
     triangular solve itself uses the non-transpose operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor
     (fill mode flipped, same reasoning as for the lower factor above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
                 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds an explicit GPU copy of the
   transpose of the matrix (CSR of A^T via csr2csc, or HYB/ELL via hyb2csr ->
   csr2csc -> csr2hyb) and stores it in cusparsestruct->matTranspose, so that
   MatMultTranspose can use a non-transpose SpMV.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information.
     NOTE(review): matstructT loaded above is overwritten here; callers appear
     to invoke this only when matTranspose is NULL -- confirm at call sites */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-side scalars used by the SpMV calls */
  err  = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err  = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err  = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows    = A->rmap->n;
    matrixT->num_cols    = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of the original CSR matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: the transpose is not stored in
     compressed-row form here) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the
   explicitly transposed triangular factors (built lazily on first use):
   row-permute b, solve with U^T then L^T, then apply the column permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (the upper factor comes first when solving the transpose) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the
   GPU when the factorization ordering is the identity, so no permutations of
   the right-hand side or solution are required.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the lower/upper
   triangular factors: row-permute b, solve L then U, then apply the column
   permutation. Both vectors must be of type VECSEQCUDA.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU for the device pointers below to be valid */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution.
     */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU when the
   factorization ordering is the identity: solve L then U with no permutations.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   MatSeqAIJCUSPARSECopyToGPU - mirrors the host SeqAIJ matrix onto the GPU.

   If the nonzero pattern is unchanged (CSR only) just the values are copied;
   otherwise the GPU structure is rebuilt in compressed-row form (compressed
   rows are forced on the GPU even when the host does not use them) in either
   CSR or ELL/HYB format.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only: the nonzero pattern on the GPU is still current */
      matrix->values->assign(a->a, a->a+a->nz);
    } else {
      Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows with at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build ii/ridx over nonzero rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k] =
                       a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-side alpha/beta scalars used by the SpMV calls */
        err  = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
        err  = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err  = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
        err  = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* stage the data in CSR, convert to HYB, then discard the staging copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY;
        cusparsestruct->workVector->resize(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      ierr = WaitForGPU();CHKERRCUDA(ierr);

      /* NOTE(review): valid_GPU_matrix/nonzerostate are only updated on this
         rebuild branch; the values-only fast path above leaves them as-is --
         confirm this is intentional */
      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   MatCreateVecs_SeqAIJCUSPARSE - creates CUDA vectors compatible with this
   matrix: right vectors use the column layout, left vectors the row layout.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* device functor: accumulate element 0 of the tuple into element 1 (y += x),
   used to scatter compressed-row SpMV results back into the full vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - computes y = A x on the GPU via cuSPARSE SpMV,
   in CSR or ELL/HYB format depending on the chosen storage.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - computes y = A^T x on the GPU. The
   explicit transpose is generated lazily on first use, after which a
   non-transpose SpMV on the transposed data is performed.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - computes z = y + A x on the GPU: copies y into z,
   performs a compressed-row SpMV into the work vector, then scatters the
   results into z with a += (see VecCUDAPlusEquals below in this file).
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the
operations that modify the matrix on the CPU (axpy, scale, etc) */ 1514 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1515 try { 1516 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1517 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1518 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1519 zptr = thrust::device_pointer_cast(zarray); 1520 1521 /* multiply add */ 1522 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1523 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1524 /* here we need to be careful to set the number of rows in the multiply to the 1525 number of compressed rows in the matrix ... which is equivalent to the 1526 size of the workVector */ 1527 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1528 mat->num_rows, mat->num_cols, 1529 mat->num_entries, matstruct->alpha, matstruct->descr, 1530 mat->values->data().get(), mat->row_offsets->data().get(), 1531 mat->column_indices->data().get(), xarray, matstruct->beta, 1532 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1533 } else { 1534 #if CUDA_VERSION>=4020 1535 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1536 if (cusparsestruct->workVector->size()) { 1537 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1538 matstruct->alpha, matstruct->descr, hybMat, 1539 xarray, matstruct->beta, 1540 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1541 } 1542 #endif 1543 } 1544 1545 /* scatter the data from the temporary into the full vector with a += operation */ 1546 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1547 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1548 VecCUDAPlusEquals()); 1549 ierr = 
VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1550 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1551 1552 } catch(char *ex) { 1553 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1554 } 1555 ierr = WaitForGPU();CHKERRCUDA(ierr); 1556 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1557 PetscFunctionReturn(0); 1558 } 1559 1560 #undef __FUNCT__ 1561 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1562 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1563 { 1564 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1565 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1566 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1567 thrust::device_ptr<PetscScalar> zptr; 1568 const PetscScalar *xarray; 1569 PetscScalar *zarray; 1570 PetscErrorCode ierr; 1571 cusparseStatus_t stat; 1572 1573 PetscFunctionBegin; 1574 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1575 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1576 if (!matstructT) { 1577 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1578 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1579 } 1580 1581 try { 1582 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1583 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1584 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1585 zptr = thrust::device_pointer_cast(zarray); 1586 1587 /* multiply add with matrix transpose */ 1588 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1589 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1590 /* here we need to be careful to set the number of rows in the multiply to the 1591 number of compressed rows in the matrix ... 
which is equivalent to the 1592 size of the workVector */ 1593 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1594 mat->num_rows, mat->num_cols, 1595 mat->num_entries, matstructT->alpha, matstructT->descr, 1596 mat->values->data().get(), mat->row_offsets->data().get(), 1597 mat->column_indices->data().get(), xarray, matstructT->beta, 1598 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1599 } else { 1600 #if CUDA_VERSION>=4020 1601 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1602 if (cusparsestruct->workVector->size()) { 1603 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1604 matstructT->alpha, matstructT->descr, hybMat, 1605 xarray, matstructT->beta, 1606 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1607 } 1608 #endif 1609 } 1610 1611 /* scatter the data from the temporary into the full vector with a += operation */ 1612 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1613 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1614 VecCUDAPlusEquals()); 1615 1616 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1617 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1618 1619 } catch(char *ex) { 1620 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1621 } 1622 ierr = WaitForGPU();CHKERRCUDA(ierr); 1623 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1624 PetscFunctionReturn(0); 1625 } 1626 1627 #undef __FUNCT__ 1628 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1629 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1630 { 1631 PetscErrorCode ierr; 1632 1633 PetscFunctionBegin; 1634 ierr = 
MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1635 if (A->factortype==MAT_FACTOR_NONE) { 1636 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1637 } 1638 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1639 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1640 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1641 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1642 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1643 PetscFunctionReturn(0); 1644 } 1645 1646 /* --------------------------------------------------------------------------------*/ 1647 #undef __FUNCT__ 1648 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1649 /*@ 1650 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1651 (the default parallel PETSc format). This matrix will ultimately pushed down 1652 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1653 assembly performance the user should preallocate the matrix storage by setting 1654 the parameter nz (or the array nnz). By setting these parameters accurately, 1655 performance during matrix assembly can be increased by more than a factor of 50. 1656 1657 Collective on MPI_Comm 1658 1659 Input Parameters: 1660 + comm - MPI communicator, set to PETSC_COMM_SELF 1661 . m - number of rows 1662 . n - number of columns 1663 . nz - number of nonzeros per row (same for all rows) 1664 - nnz - array containing the number of nonzeros in the various rows 1665 (possibly different for each row) or NULL 1666 1667 Output Parameter: 1668 . A - the matrix 1669 1670 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1671 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1672 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1673 1674 Notes: 1675 If nnz is given then nz is ignored 1676 1677 The AIJ format (also called the Yale sparse matrix format or 1678 compressed row storage), is fully compatible with standard Fortran 77 1679 storage. That is, the stored row and column indices can begin at 1680 either one (as in Fortran) or zero. See the users' manual for details. 1681 1682 Specify the preallocated storage with either nz or nnz (not both). 1683 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1684 allocation. For large problems you MUST preallocate memory or you 1685 will get TERRIBLE performance, see the users' manual chapter on matrices. 1686 1687 By default, this format uses inodes (identical nodes) when possible, to 1688 improve numerical efficiency of matrix-vector products and solves. We 1689 search for consecutive rows with the same nonzero structure, thereby 1690 reusing matrix information to achieve increased efficiency. 
1691 1692 Level: intermediate 1693 1694 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1695 @*/ 1696 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1697 { 1698 PetscErrorCode ierr; 1699 1700 PetscFunctionBegin; 1701 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1702 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1703 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1704 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1705 PetscFunctionReturn(0); 1706 } 1707 1708 #undef __FUNCT__ 1709 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1710 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1711 { 1712 PetscErrorCode ierr; 1713 1714 PetscFunctionBegin; 1715 if (A->factortype==MAT_FACTOR_NONE) { 1716 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1717 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1718 } 1719 } else { 1720 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1721 } 1722 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1723 PetscFunctionReturn(0); 1724 } 1725 1726 #undef __FUNCT__ 1727 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1728 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1729 { 1730 PetscErrorCode ierr; 1731 cusparseStatus_t stat; 1732 cusparseHandle_t handle=0; 1733 1734 PetscFunctionBegin; 1735 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1736 if (B->factortype==MAT_FACTOR_NONE) { 1737 /* you cannot check the inode.use flag here since the matrix was just created. 
1738 now build a GPU matrix data structure */ 1739 B->spptr = new Mat_SeqAIJCUSPARSE; 1740 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1741 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1742 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1743 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1744 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1745 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1746 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1747 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1748 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1749 } else { 1750 /* NEXT, set the pointers to the triangular factors */ 1751 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1752 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1753 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1754 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1755 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1756 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1757 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1758 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1759 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1760 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1761 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1762 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1763 } 1764 1765 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1766 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1767 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1768 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1769 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1770 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1771 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1772 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1773 1774 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1775 1776 B->valid_GPU_matrix = 
PETSC_CUDA_UNALLOCATED; 1777 1778 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1779 PetscFunctionReturn(0); 1780 } 1781 1782 /*M 1783 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1784 1785 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1786 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1787 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1788 1789 Options Database Keys: 1790 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1791 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1792 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1793 1794 Level: beginner 1795 1796 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1797 M*/ 1798 1799 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1800 1801 1802 #undef __FUNCT__ 1803 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1804 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1805 { 1806 PetscErrorCode ierr; 1807 1808 PetscFunctionBegin; 1809 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1810 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1811 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1812 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1813 PetscFunctionReturn(0); 1814 } 1815 1816 1817 #undef __FUNCT__ 1818 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1819 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1820 { 1821 cusparseStatus_t stat; 1822 cusparseHandle_t handle; 1823 1824 PetscFunctionBegin; 1825 if (*cusparsestruct) { 1826 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1827 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1828 delete (*cusparsestruct)->workVector; 1829 if (handle = (*cusparsestruct)->handle) { 1830 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1831 } 1832 delete *cusparsestruct; 1833 *cusparsestruct = 0; 1834 } 1835 PetscFunctionReturn(0); 1836 } 1837 1838 #undef __FUNCT__ 1839 #define __FUNCT__ "CsrMatrix_Destroy" 1840 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix 
**mat) 1841 { 1842 PetscFunctionBegin; 1843 if (*mat) { 1844 delete (*mat)->values; 1845 delete (*mat)->column_indices; 1846 delete (*mat)->row_offsets; 1847 delete *mat; 1848 *mat = 0; 1849 } 1850 PetscFunctionReturn(0); 1851 } 1852 1853 #undef __FUNCT__ 1854 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1855 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1856 { 1857 cusparseStatus_t stat; 1858 PetscErrorCode ierr; 1859 1860 PetscFunctionBegin; 1861 if (*trifactor) { 1862 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1863 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1864 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1865 delete *trifactor; 1866 *trifactor = 0; 1867 } 1868 PetscFunctionReturn(0); 1869 } 1870 1871 #undef __FUNCT__ 1872 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1873 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1874 { 1875 CsrMatrix *mat; 1876 cusparseStatus_t stat; 1877 cudaError_t err; 1878 1879 PetscFunctionBegin; 1880 if (*matstruct) { 1881 if ((*matstruct)->mat) { 1882 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1883 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1884 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1885 } else { 1886 mat = (CsrMatrix*)(*matstruct)->mat; 1887 CsrMatrix_Destroy(&mat); 1888 } 1889 } 1890 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1891 delete (*matstruct)->cprowIndices; 1892 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1893 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1894 delete *matstruct; 1895 *matstruct = 0; 1896 } 1897 
PetscFunctionReturn(0); 1898 } 1899 1900 #undef __FUNCT__ 1901 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1902 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1903 { 1904 cusparseHandle_t handle; 1905 cusparseStatus_t stat; 1906 1907 PetscFunctionBegin; 1908 if (*trifactors) { 1909 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1910 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1911 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1912 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1913 delete (*trifactors)->rpermIndices; 1914 delete (*trifactors)->cpermIndices; 1915 delete (*trifactors)->workVector; 1916 if (handle = (*trifactors)->handle) { 1917 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1918 } 1919 delete *trifactors; 1920 *trifactors = 0; 1921 } 1922 PetscFunctionReturn(0); 1923 } 1924 1925