1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 40 41 #undef __FUNCT__ 42 #define __FUNCT__ "MatCUSPARSESetStream" 43 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 44 { 45 cusparseStatus_t stat; 46 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 47 48 PetscFunctionBegin; 49 cusparsestruct->stream = stream; 50 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat); 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatCUSPARSESetHandle" 56 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 57 { 58 cusparseStatus_t stat; 59 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 60 61 PetscFunctionBegin; 62 if (cusparsestruct->handle) 63 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat); 64 cusparsestruct->handle = handle; 65 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 66 PetscFunctionReturn(0); 67 } 68 69 #undef __FUNCT__ 70 #define __FUNCT__ "MatCUSPARSEClearHandle" 71 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 72 { 73 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 74 PetscFunctionBegin; 75 if (cusparsestruct->handle) 76 cusparsestruct->handle = 0; 77 PetscFunctionReturn(0); 78 } 79 80 #undef __FUNCT__ 81 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 82 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 83 { 84 PetscFunctionBegin; 85 *type = MATSOLVERCUSPARSE; 86 PetscFunctionReturn(0); 87 } 88 89 /*MC 90 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 91 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 92 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 93 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 94 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 95 algorithms are not recommended. This class does NOT support direct solver operations. 96 97 Level: beginner 98 99 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 100 M*/ 101 102 #undef __FUNCT__ 103 #define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse" 104 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 105 { 106 PetscErrorCode ierr; 107 PetscInt n = A->rmap->n; 108 109 PetscFunctionBegin; 110 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 111 (*B)->factortype = ftype; 112 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 113 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 114 115 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 116 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 117 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 118 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 119 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 120 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 121 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 122 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 123 124 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 125 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 126 PetscFunctionReturn(0); 127 } 128 129 #undef __FUNCT__ 130 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 131 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 132 { 133 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 134 135 PetscFunctionBegin; 136 #if CUDA_VERSION>=4020 137 switch (op) { 138 case MAT_CUSPARSE_MULT: 139 cusparsestruct->format = format; 140 break; 141 case MAT_CUSPARSE_ALL: 142 cusparsestruct->format = format; 143 break; 144 default: 145 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 146 } 147 #else 148 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 149 #endif 150 PetscFunctionReturn(0); 151 } 152 153 /*@ 154 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 155 operation. Only the MatMult operation can use different GPU storage formats 156 for MPIAIJCUSPARSE matrices. 157 Not Collective 158 159 Input Parameters: 160 + A - Matrix of type SEQAIJCUSPARSE 161 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 162 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 163 164 Output Parameter: 165 166 Level: intermediate 167 168 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 169 @*/ 170 #undef __FUNCT__ 171 #define __FUNCT__ "MatCUSPARSESetFormat" 172 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 173 { 174 PetscErrorCode ierr; 175 176 PetscFunctionBegin; 177 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 178 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 179 PetscFunctionReturn(0); 180 } 181 182 #undef __FUNCT__ 183 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 184 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 185 { 186 PetscErrorCode ierr; 187 MatCUSPARSEStorageFormat format; 188 PetscBool flg; 189 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 190 191 PetscFunctionBegin; 192 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 193 ierr = PetscObjectOptionsBegin((PetscObject)A); 194 if (A->factortype==MAT_FACTOR_NONE) { 195 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 196 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 197 if (flg) { 198 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 199 } 200 } 201 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 202 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 203 if (flg) { 204 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 205 } 206 ierr = PetscOptionsEnd();CHKERRQ(ierr); 207 PetscFunctionReturn(0); 208 209 } 210 211 #undef __FUNCT__ 212 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 213 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 214 { 215 PetscErrorCode ierr; 216 217 PetscFunctionBegin; 218 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 219 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 220 PetscFunctionReturn(0); 221 } 222 223 #undef __FUNCT__ 224 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 225 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 226 { 227 PetscErrorCode ierr; 228 229 PetscFunctionBegin; 230 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 231 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 237 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 238 { 239 PetscErrorCode ierr; 240 241 PetscFunctionBegin; 242 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 243 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 244 PetscFunctionReturn(0); 245 } 246 247 #undef __FUNCT__ 248 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 249 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 250 { 251 PetscErrorCode ierr; 252 253 PetscFunctionBegin; 254 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 255 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 256 PetscFunctionReturn(0); 257 } 258 259 #undef __FUNCT__ 260 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 261 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 262 { 263 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 264 PetscInt n = A->rmap->n; 265 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 266 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 267 cusparseStatus_t stat; 268 const PetscInt *ai = a->i,*aj = a->j,*vi; 269 const MatScalar *aa = a->a,*v; 270 PetscInt *AiLo, *AjLo; 271 PetscScalar *AALo; 272 PetscInt i,nz, nzLower, offset, rowOffset; 273 PetscErrorCode ierr; 274 275 PetscFunctionBegin; 276 if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) { 277 try { 278 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 279 nzLower=n+ai[n]-ai[1]; 280 281 /* Allocate Space for the lower triangular matrix */ 282 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 283 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr); 284 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr); 285 286 /* Fill the lower triangular matrix */ 287 AiLo[0] = (PetscInt) 0; 288 AiLo[n] = nzLower; 289 AjLo[0] = (PetscInt) 0; 290 AALo[0] = (MatScalar) 1.0; 291 v = aa; 292 vi = aj; 293 offset = 1; 294 rowOffset= 1; 295 for (i=1; i<n; i++) { 296 nz = ai[i+1] - ai[i]; 297 /* additional 1 for the term on the diagonal */ 298 AiLo[i] = rowOffset; 299 rowOffset += nz+1; 300 301 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 302 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 303 304 offset += nz; 305 AjLo[offset] = (PetscInt) i; 306 AALo[offset] = (MatScalar) 1.0; 307 offset += 1; 308 309 v += nz; 310 vi += nz; 311 } 312 313 /* allocate space for the triangular factor information */ 314 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 315 316 /* Create the matrix description */ 317 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 318 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 319 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 320 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat); 321 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 322 323 /* Create the solve analysis information */ 324 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 325 326 /* set the operation */ 327 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 328 329 /* set the matrix */ 330 loTriFactor->csrMat = new CsrMatrix; 331 loTriFactor->csrMat->num_rows = n; 332 loTriFactor->csrMat->num_cols = n; 333 loTriFactor->csrMat->num_entries = nzLower; 334 335 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 336 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 337 338 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 339 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 340 341 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 342 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 343 344 /* perform the solve analysis */ 345 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 346 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 347 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 348 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 349 350 /* assign the pointer. Is this really necessary? */ 351 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 352 353 ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr); 354 ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr); 355 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 356 } catch(char *ex) { 357 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 358 } 359 } 360 PetscFunctionReturn(0); 361 } 362 363 #undef __FUNCT__ 364 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 365 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 366 { 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 368 PetscInt n = A->rmap->n; 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 371 cusparseStatus_t stat; 372 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 373 const MatScalar *aa = a->a,*v; 374 PetscInt *AiUp, *AjUp; 375 PetscScalar *AAUp; 376 PetscInt i,nz, nzUpper, offset; 377 PetscErrorCode ierr; 378 379 PetscFunctionBegin; 380 if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) { 381 try { 382 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 383 nzUpper = adiag[0]-adiag[n]; 384 385 /* Allocate Space for the upper triangular matrix */ 386 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 387 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 388 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 389 390 /* Fill the upper triangular matrix */ 391 AiUp[0]=(PetscInt) 0; 392 AiUp[n]=nzUpper; 393 offset = nzUpper; 394 for (i=n-1; i>=0; i--) { 395 v = aa + adiag[i+1] + 1; 396 vi = aj + adiag[i+1] + 1; 397 398 /* number of elements NOT on the diagonal */ 399 nz = adiag[i] - adiag[i+1]-1; 400 401 /* decrement the offset */ 402 offset -= (nz+1); 403 404 /* first, set the diagonal elements */ 405 AjUp[offset] = (PetscInt) i; 406 AAUp[offset] = (MatScalar)1./v[nz]; 407 AiUp[i] = AiUp[i+1] - (nz+1); 408 409 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 410 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 411 } 412 413 /* allocate space for the triangular factor information */ 414 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 415 416 /* Create the matrix description */ 417 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 418 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 419 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 420 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 421 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 422 423 /* Create the solve analysis information */ 424 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 425 426 /* set the operation */ 427 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 428 429 /* set the matrix */ 430 upTriFactor->csrMat = new CsrMatrix; 431 upTriFactor->csrMat->num_rows = n; 432 upTriFactor->csrMat->num_cols = n; 433 upTriFactor->csrMat->num_entries = nzUpper; 434 435 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 436 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 437 438 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 439 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 440 441 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 442 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 443 444 /* perform the solve analysis */ 445 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 446 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 447 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 448 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 449 450 /* assign the pointer. Is this really necessary? */ 451 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 452 453 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 454 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 455 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 456 } catch(char *ex) { 457 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 458 } 459 } 460 PetscFunctionReturn(0); 461 } 462 463 #undef __FUNCT__ 464 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 465 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 466 { 467 PetscErrorCode ierr; 468 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 469 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 470 IS isrow = a->row,iscol = a->icol; 471 PetscBool row_identity,col_identity; 472 const PetscInt *r,*c; 473 PetscInt n = A->rmap->n; 474 475 PetscFunctionBegin; 476 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 477 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 478 479 cusparseTriFactors->workVector = new THRUSTARRAY; 480 cusparseTriFactors->workVector->resize(n); 481 cusparseTriFactors->nnz=a->nz; 482 483 A->valid_GPU_matrix = PETSC_CUDA_BOTH; 484 /*lower triangular indices */ 485 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 486 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 487 if (!row_identity) { 488 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 489 cusparseTriFactors->rpermIndices->assign(r, r+n); 490 } 491 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 492 493 /*upper triangular indices */ 494 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 495 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 496 if (!col_identity) { 497 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 498 cusparseTriFactors->cpermIndices->assign(c, c+n); 499 } 500 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 501 PetscFunctionReturn(0); 502 } 503 504 #undef __FUNCT__ 505 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 506 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 507 { 508 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 509 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 510 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 511 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 512 cusparseStatus_t stat; 513 PetscErrorCode ierr; 514 PetscInt *AiUp, *AjUp; 515 PetscScalar *AAUp; 516 PetscScalar *AALo; 517 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 518 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 519 const PetscInt *ai = b->i,*aj = b->j,*vj; 520 const MatScalar *aa = b->a,*v; 521 522 PetscFunctionBegin; 523 if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) { 524 try { 525 /* Allocate Space for the upper triangular matrix */ 526 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 527 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 528 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 529 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 530 531 /* Fill the upper triangular matrix */ 532 AiUp[0]=(PetscInt) 0; 533 AiUp[n]=nzUpper; 534 offset = 0; 535 for (i=0; i<n; i++) { 536 /* set the pointers */ 537 v = aa + ai[i]; 538 vj = aj + ai[i]; 539 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 540 541 /* first, set the diagonal elements */ 542 AjUp[offset] = (PetscInt) i; 543 AAUp[offset] = (MatScalar)1.0/v[nz]; 544 AiUp[i] = offset; 545 AALo[offset] = (MatScalar)1.0/v[nz]; 546 547 offset+=1; 548 if (nz>0) { 549 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 550 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 551 for (j=offset; j<offset+nz; j++) { 552 AAUp[j] = -AAUp[j]; 553 AALo[j] = AAUp[j]/v[nz]; 554 } 555 offset+=nz; 556 } 557 } 558 559 /* allocate space for the triangular factor information */ 560 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 561 562 /* Create the matrix description */ 563 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 564 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 565 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 566 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 567 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 568 569 /* Create the solve analysis information */ 570 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 571 572 /* set the operation */ 573 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 574 575 /* set the matrix */ 576 upTriFactor->csrMat = new CsrMatrix; 577 upTriFactor->csrMat->num_rows = A->rmap->n; 578 upTriFactor->csrMat->num_cols = A->cmap->n; 579 upTriFactor->csrMat->num_entries = a->nz; 580 581 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 582 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 583 584 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 585 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 586 587 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 588 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 589 590 /* perform the solve analysis */ 591 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 592 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 593 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 594 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 595 596 /* assign the pointer. Is this really necessary? */ 597 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 598 599 /* allocate space for the triangular factor information */ 600 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 601 602 /* Create the matrix description */ 603 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 604 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 605 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 606 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 607 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 608 609 /* Create the solve analysis information */ 610 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 611 612 /* set the operation */ 613 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 614 615 /* set the matrix */ 616 loTriFactor->csrMat = new CsrMatrix; 617 loTriFactor->csrMat->num_rows = A->rmap->n; 618 loTriFactor->csrMat->num_cols = A->cmap->n; 619 loTriFactor->csrMat->num_entries = a->nz; 620 621 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 622 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 623 624 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 625 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 626 627 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 628 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 629 630 /* perform the solve analysis */ 631 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 632 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 633 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 634 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 635 636 /* assign the pointer. Is this really necessary? */ 637 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 638 639 A->valid_GPU_matrix = PETSC_CUDA_BOTH; 640 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 641 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 642 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 643 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 644 } catch(char *ex) { 645 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 646 } 647 } 648 PetscFunctionReturn(0); 649 } 650 651 #undef __FUNCT__ 652 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 653 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 654 { 655 PetscErrorCode ierr; 656 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 657 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 658 IS ip = a->row; 659 const PetscInt *rip; 660 PetscBool perm_identity; 661 PetscInt n = A->rmap->n; 662 663 PetscFunctionBegin; 664 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 665 cusparseTriFactors->workVector = new THRUSTARRAY; 666 cusparseTriFactors->workVector->resize(n); 667 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 668 669 /*lower triangular indices */ 670 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 671 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 672 if (!perm_identity) { 673 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 674 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 675 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 676 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 677 } 678 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 679 PetscFunctionReturn(0); 680 } 681 682 #undef __FUNCT__ 683 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 684 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 685 { 686 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 687 IS isrow = b->row,iscol = b->col; 688 PetscBool row_identity,col_identity; 689 PetscErrorCode ierr; 690 691 PetscFunctionBegin; 692 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 693 /* determine which version of MatSolve needs to be used. */ 694 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 695 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 696 if (row_identity && col_identity) { 697 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 698 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 699 } else { 700 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 701 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 702 } 703 704 /* get the triangular factors */ 705 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 706 PetscFunctionReturn(0); 707 } 708 709 #undef __FUNCT__ 710 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 711 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 712 { 713 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 714 IS ip = b->row; 715 PetscBool perm_identity; 716 PetscErrorCode ierr; 717 718 PetscFunctionBegin; 719 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 720 721 /* determine which version of MatSolve needs to be used. */ 722 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 723 if (perm_identity) { 724 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 725 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 726 } else { 727 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 728 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 729 } 730 731 /* get the triangular factors */ 732 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 733 PetscFunctionReturn(0); 734 } 735 736 #undef __FUNCT__ 737 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 738 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 739 { 740 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 741 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 742 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 743 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 744 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 745 cusparseStatus_t stat; 746 cusparseIndexBase_t indexBase; 747 cusparseMatrixType_t matrixType; 748 cusparseFillMode_t fillMode; 749 cusparseDiagType_t diagType; 750 751 PetscFunctionBegin; 752 753 /*********************************************/ 754 /* Now the Transpose of the Lower Tri Factor */ 755 /*********************************************/ 756 757 /* allocate space for the transpose of the lower triangular factor */ 758 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 759 760 /* set the matrix descriptors of the lower triangular factor */ 761 matrixType = cusparseGetMatType(loTriFactor->descr); 762 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 763 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 764 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 765 diagType = cusparseGetMatDiagType(loTriFactor->descr); 766 767 /* Create the matrix description */ 768 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 769 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 770 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 771 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 772 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 773 774 /* Create the solve analysis information */ 775 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 776 777 /* set the operation */ 778 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 779 780 /* allocate GPU space for the CSC of the lower triangular factor*/ 781 loTriFactorT->csrMat = new CsrMatrix; 782 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 783 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 784 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 785 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 786 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 787 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 788 789 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 790 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 791 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 792 loTriFactor->csrMat->values->data().get(), 793 loTriFactor->csrMat->row_offsets->data().get(), 794 loTriFactor->csrMat->column_indices->data().get(), 795 loTriFactorT->csrMat->values->data().get(), 796 loTriFactorT->csrMat->column_indices->data().get(), 797 loTriFactorT->csrMat->row_offsets->data().get(), 798 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 799 800 /* perform the solve analysis on the transposed matrix */ 801 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 802 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 803 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 804 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 805 loTriFactorT->solveInfo);CHKERRCUDA(stat); 806 807 /* assign the pointer. Is this really necessary? */ 808 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 809 810 /*********************************************/ 811 /* Now the Transpose of the Upper Tri Factor */ 812 /*********************************************/ 813 814 /* allocate space for the transpose of the upper triangular factor */ 815 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 816 817 /* set the matrix descriptors of the upper triangular factor */ 818 matrixType = cusparseGetMatType(upTriFactor->descr); 819 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 820 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 821 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 822 diagType = cusparseGetMatDiagType(upTriFactor->descr); 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 826 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 827 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 828 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 829 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 830 831 /* Create the solve analysis information */ 832 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 833 834 /* set the operation */ 835 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 836 837 /* allocate GPU space for the CSC of the upper triangular factor*/ 838 upTriFactorT->csrMat = new CsrMatrix; 839 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 840 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 841 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 842 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 843 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 844 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 845 846 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 847 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 848 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 849 upTriFactor->csrMat->values->data().get(), 850 upTriFactor->csrMat->row_offsets->data().get(), 851 upTriFactor->csrMat->column_indices->data().get(), 852 upTriFactorT->csrMat->values->data().get(), 853 upTriFactorT->csrMat->column_indices->data().get(), 854 upTriFactorT->csrMat->row_offsets->data().get(), 855 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 856 857 /* perform the solve analysis on the transposed matrix */ 858 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 859 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 860 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 861 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 862 upTriFactorT->solveInfo);CHKERRCUDA(stat); 863 864 /* assign the pointer. Is this really necessary? */ 865 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 866 PetscFunctionReturn(0); 867 } 868 869 #undef __FUNCT__ 870 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 871 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 872 { 873 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 874 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 875 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 876 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 877 cusparseStatus_t stat; 878 cusparseIndexBase_t indexBase; 879 cudaError_t err; 880 881 PetscFunctionBegin; 882 883 /* allocate space for the triangular factor information */ 884 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 885 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 886 indexBase = cusparseGetMatIndexBase(matstruct->descr); 887 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 888 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 889 890 /* set alpha and beta */ 891 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 892 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 893 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 894 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 895 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 896 897 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 898 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 899 CsrMatrix *matrixT= new CsrMatrix; 900 matrixT->num_rows = A->rmap->n; 901 matrixT->num_cols = A->cmap->n; 902 matrixT->num_entries = a->nz; 903 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 904 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 905 matrixT->values = new THRUSTARRAY(a->nz); 906 907 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 908 indexBase = cusparseGetMatIndexBase(matstruct->descr); 909 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 910 matrix->num_cols, matrix->num_entries, 911 matrix->values->data().get(), 912 matrix->row_offsets->data().get(), 913 matrix->column_indices->data().get(), 914 matrixT->values->data().get(), 915 matrixT->column_indices->data().get(), 916 matrixT->row_offsets->data().get(), 917 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 918 919 /* assign the pointer */ 920 matstructT->mat = matrixT; 921 922 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 923 #if CUDA_VERSION>=5000 924 /* First convert HYB to CSR */ 925 CsrMatrix *temp= new CsrMatrix; 926 temp->num_rows = A->rmap->n; 927 temp->num_cols = A->cmap->n; 928 temp->num_entries = a->nz; 929 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 930 temp->column_indices = new THRUSTINTARRAY32(a->nz); 931 temp->values = new THRUSTARRAY(a->nz); 932 933 934 stat = cusparse_hyb2csr(cusparsestruct->handle, 935 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 936 temp->values->data().get(), 937 temp->row_offsets->data().get(), 938 temp->column_indices->data().get());CHKERRCUDA(stat); 939 940 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 941 CsrMatrix *tempT= new CsrMatrix; 942 tempT->num_rows = A->rmap->n; 943 tempT->num_cols = A->cmap->n; 944 tempT->num_entries = a->nz; 945 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 946 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 947 tempT->values = new THRUSTARRAY(a->nz); 948 949 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 950 temp->num_cols, temp->num_entries, 951 temp->values->data().get(), 952 temp->row_offsets->data().get(), 953 temp->column_indices->data().get(), 954 tempT->values->data().get(), 955 tempT->column_indices->data().get(), 956 tempT->row_offsets->data().get(), 957 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 958 959 /* Last, convert CSC to HYB */ 960 cusparseHybMat_t hybMat; 961 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 962 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 963 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 964 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 965 matstructT->descr, tempT->values->data().get(), 966 tempT->row_offsets->data().get(), 967 tempT->column_indices->data().get(), 968 hybMat, 0, partition);CHKERRCUDA(stat); 969 970 /* assign the pointer */ 971 matstructT->mat = hybMat; 972 973 /* delete temporaries */ 974 if (tempT) { 975 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 976 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 977 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 978 delete (CsrMatrix*) tempT; 979 } 980 if (temp) { 981 if (temp->values) delete (THRUSTARRAY*) temp->values; 982 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 983 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 984 delete (CsrMatrix*) temp; 985 } 986 #else 987 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 988 #endif 989 } 990 /* assign the compressed row indices */ 991 matstructT->cprowIndices = new THRUSTINTARRAY; 992 993 /* assign the pointer */ 994 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 995 PetscFunctionReturn(0); 996 } 997 998 #undef __FUNCT__ 999 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 1000 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1001 { 1002 PetscInt n = xx->map->n; 1003 PetscScalar *xarray, *barray; 1004 thrust::device_ptr<PetscScalar> xGPU,bGPU; 1005 cusparseStatus_t stat; 1006 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1007 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1008 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1009 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1010 PetscErrorCode ierr; 1011 1012 PetscFunctionBegin; 1013 /* Analyze the matrix and create the transpose ... on the fly */ 1014 if (!loTriFactorT && !upTriFactorT) { 1015 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1016 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1017 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1018 } 1019 1020 /* Get the GPU pointers */ 1021 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1022 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1023 xGPU = thrust::device_pointer_cast(xarray); 1024 bGPU = thrust::device_pointer_cast(barray); 1025 1026 /* First, reorder with the row permutation */ 1027 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1028 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1029 xGPU); 1030 1031 /* First, solve U */ 1032 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1033 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1034 upTriFactorT->csrMat->values->data().get(), 1035 upTriFactorT->csrMat->row_offsets->data().get(), 1036 upTriFactorT->csrMat->column_indices->data().get(), 1037 upTriFactorT->solveInfo, 1038 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1039 1040 /* Then, solve L */ 1041 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1042 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1043 loTriFactorT->csrMat->values->data().get(), 1044 loTriFactorT->csrMat->row_offsets->data().get(), 1045 loTriFactorT->csrMat->column_indices->data().get(), 1046 loTriFactorT->solveInfo, 1047 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1048 1049 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1050 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1051 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1052 tempGPU->begin()); 1053 1054 /* Copy the temporary to the full solution. */ 1055 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1056 1057 /* restore */ 1058 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1059 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1060 ierr = WaitForGPU();CHKERRCUDA(ierr); 1061 1062 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1063 PetscFunctionReturn(0); 1064 } 1065 1066 #undef __FUNCT__ 1067 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1068 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1069 { 1070 PetscScalar *xarray, *barray; 1071 cusparseStatus_t stat; 1072 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1073 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1074 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1075 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1076 PetscErrorCode ierr; 1077 1078 PetscFunctionBegin; 1079 /* Analyze the matrix and create the transpose ... on the fly */ 1080 if (!loTriFactorT && !upTriFactorT) { 1081 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1082 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1083 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1084 } 1085 1086 /* Get the GPU pointers */ 1087 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1088 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1089 1090 /* First, solve U */ 1091 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1092 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1093 upTriFactorT->csrMat->values->data().get(), 1094 upTriFactorT->csrMat->row_offsets->data().get(), 1095 upTriFactorT->csrMat->column_indices->data().get(), 1096 upTriFactorT->solveInfo, 1097 barray, tempGPU->data().get());CHKERRCUDA(stat); 1098 1099 /* Then, solve L */ 1100 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1101 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1102 loTriFactorT->csrMat->values->data().get(), 1103 loTriFactorT->csrMat->row_offsets->data().get(), 1104 loTriFactorT->csrMat->column_indices->data().get(), 1105 loTriFactorT->solveInfo, 1106 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1107 1108 /* restore */ 1109 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1110 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1111 ierr = WaitForGPU();CHKERRCUDA(ierr); 1112 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 #undef __FUNCT__ 1117 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1118 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1119 { 1120 PetscScalar *xarray, *barray; 1121 thrust::device_ptr<PetscScalar> xGPU,bGPU; 1122 cusparseStatus_t stat; 1123 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1124 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1125 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1126 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1127 PetscErrorCode ierr; 1128 VecType t; 1129 PetscBool flg; 1130 1131 PetscFunctionBegin; 1132 ierr = VecGetType(bb,&t);CHKERRQ(ierr); 1133 ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr); 1134 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA); 1135 ierr = VecGetType(xx,&t);CHKERRQ(ierr); 1136 ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr); 1137 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA); 1138 1139 /* Get the GPU pointers */ 1140 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1141 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1142 xGPU = thrust::device_pointer_cast(xarray); 1143 bGPU = thrust::device_pointer_cast(barray); 1144 1145 /* First, reorder with the row permutation */ 1146 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1147 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1148 xGPU); 1149 1150 /* Next, solve L */ 1151 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1152 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1153 loTriFactor->csrMat->values->data().get(), 1154 loTriFactor->csrMat->row_offsets->data().get(), 1155 loTriFactor->csrMat->column_indices->data().get(), 1156 loTriFactor->solveInfo, 1157 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1158 1159 /* Then, solve U */ 1160 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1161 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1162 upTriFactor->csrMat->values->data().get(), 1163 upTriFactor->csrMat->row_offsets->data().get(), 1164 upTriFactor->csrMat->column_indices->data().get(), 1165 upTriFactor->solveInfo, 1166 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1167 1168 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1169 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1170 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1171 tempGPU->begin()); 1172 1173 /* Copy the temporary to the full solution. */ 1174 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1175 1176 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1177 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1178 ierr = WaitForGPU();CHKERRCUDA(ierr); 1179 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1180 PetscFunctionReturn(0); 1181 } 1182 1183 #undef __FUNCT__ 1184 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1185 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1186 { 1187 PetscScalar *xarray, *barray; 1188 cusparseStatus_t stat; 1189 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1190 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1191 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1192 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1193 PetscErrorCode ierr; 1194 1195 PetscFunctionBegin; 1196 /* Get the GPU pointers */ 1197 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1198 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1199 1200 /* First, solve L */ 1201 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1202 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1203 loTriFactor->csrMat->values->data().get(), 1204 loTriFactor->csrMat->row_offsets->data().get(), 1205 loTriFactor->csrMat->column_indices->data().get(), 1206 loTriFactor->solveInfo, 1207 barray, tempGPU->data().get());CHKERRCUDA(stat); 1208 1209 /* Next, solve U */ 1210 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1211 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1212 upTriFactor->csrMat->values->data().get(), 1213 upTriFactor->csrMat->row_offsets->data().get(), 1214 upTriFactor->csrMat->column_indices->data().get(), 1215 upTriFactor->solveInfo, 1216 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1217 1218 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1219 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1220 ierr = WaitForGPU();CHKERRCUDA(ierr); 1221 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1222 PetscFunctionReturn(0); 1223 } 1224 1225 #undef __FUNCT__ 1226 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1227 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1228 { 1229 1230 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1231 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1233 PetscInt m = A->rmap->n,*ii,*ridx; 1234 PetscErrorCode ierr; 1235 cusparseStatus_t stat; 1236 cudaError_t err; 1237 1238 PetscFunctionBegin; 1239 if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) { 1240 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1241 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1242 try { 1243 cusparsestruct->nonzerorow=0; 1244 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1245 1246 if (a->compressedrow.use) { 1247 m = a->compressedrow.nrows; 1248 ii = a->compressedrow.i; 1249 ridx = a->compressedrow.rindex; 1250 } else { 1251 /* Forcing compressed row on the GPU */ 1252 int k=0; 1253 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1254 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1255 ii[0]=0; 1256 for (int j = 0; j<m; j++) { 1257 if ((a->i[j+1]-a->i[j])>0) { 1258 ii[k] = a->i[j]; 1259 ridx[k]= j; 1260 k++; 1261 } 1262 } 1263 ii[cusparsestruct->nonzerorow] = a->nz; 1264 m = cusparsestruct->nonzerorow; 1265 } 1266 1267 /* allocate space for the triangular factor information */ 1268 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1269 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1270 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1271 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1272 1273 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 1274 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1275 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err); 1276 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1277 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1278 1279 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1280 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1281 /* set the matrix */ 1282 CsrMatrix *matrix= new CsrMatrix; 1283 matrix->num_rows = m; 1284 matrix->num_cols = A->cmap->n; 1285 matrix->num_entries = a->nz; 1286 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1287 matrix->row_offsets->assign(ii, ii + m+1); 1288 1289 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1290 matrix->column_indices->assign(a->j, a->j+a->nz); 1291 1292 matrix->values = new THRUSTARRAY(a->nz); 1293 matrix->values->assign(a->a, a->a+a->nz); 1294 1295 /* assign the pointer */ 1296 matstruct->mat = matrix; 1297 1298 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1299 #if CUDA_VERSION>=4020 1300 CsrMatrix *matrix= new CsrMatrix; 1301 matrix->num_rows = m; 1302 matrix->num_cols = A->cmap->n; 1303 matrix->num_entries = a->nz; 1304 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1305 matrix->row_offsets->assign(ii, ii + m+1); 1306 1307 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1308 matrix->column_indices->assign(a->j, a->j+a->nz); 1309 1310 matrix->values = new THRUSTARRAY(a->nz); 1311 matrix->values->assign(a->a, a->a+a->nz); 1312 1313 cusparseHybMat_t hybMat; 1314 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1315 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1316 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1317 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1318 matstruct->descr, matrix->values->data().get(), 1319 matrix->row_offsets->data().get(), 1320 matrix->column_indices->data().get(), 1321 hybMat, 0, partition);CHKERRCUDA(stat); 1322 /* assign the pointer */ 1323 matstruct->mat = hybMat; 1324 1325 if (matrix) { 1326 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1327 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1328 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1329 delete (CsrMatrix*)matrix; 1330 } 1331 #endif 1332 } 1333 1334 /* assign the compressed row indices */ 1335 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1336 matstruct->cprowIndices->assign(ridx,ridx+m); 1337 1338 /* assign the pointer */ 1339 cusparsestruct->mat = matstruct; 1340 1341 if (!a->compressedrow.use) { 1342 ierr = PetscFree(ii);CHKERRQ(ierr); 1343 ierr = PetscFree(ridx);CHKERRQ(ierr); 1344 } 1345 cusparsestruct->workVector = new THRUSTARRAY; 1346 cusparsestruct->workVector->resize(m); 1347 } catch(char *ex) { 1348 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1349 } 1350 ierr = WaitForGPU();CHKERRCUDA(ierr); 1351 1352 A->valid_GPU_matrix = PETSC_CUDA_BOTH; 1353 1354 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1355 } 1356 PetscFunctionReturn(0); 1357 } 1358 1359 #undef __FUNCT__ 1360 #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE" 1361 static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1362 { 1363 PetscErrorCode ierr; 1364 PetscInt rbs,cbs; 1365 1366 PetscFunctionBegin; 1367 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 1368 if (right) { 1369 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1370 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1371 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 1372 ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr); 1373 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1374 } 1375 if (left) { 1376 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1377 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1378 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 1379 ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr); 1380 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1381 } 1382 PetscFunctionReturn(0); 1383 } 1384 1385 struct VecCUDAPlusEquals 1386 { 1387 template <typename Tuple> 1388 __host__ __device__ 1389 void operator()(Tuple t) 1390 { 1391 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1392 } 1393 }; 1394 1395 #undef __FUNCT__ 1396 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1397 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1398 { 1399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1400 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1401 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1402 PetscScalar *xarray,*yarray; 1403 PetscErrorCode ierr; 1404 cusparseStatus_t stat; 1405 1406 PetscFunctionBegin; 1407 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1408 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1409 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1410 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1411 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1412 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1413 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1414 mat->num_rows, mat->num_cols, mat->num_entries, 1415 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1416 mat->column_indices->data().get(), xarray, matstruct->beta, 1417 yarray);CHKERRCUDA(stat); 1418 } else { 1419 #if CUDA_VERSION>=4020 1420 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1421 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1422 matstruct->alpha, matstruct->descr, hybMat, 1423 xarray, matstruct->beta, 1424 yarray);CHKERRCUDA(stat); 1425 #endif 1426 } 1427 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1428 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1429 if (!cusparsestruct->stream) { 1430 ierr = WaitForGPU();CHKERRCUDA(ierr); 1431 } 1432 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1433 PetscFunctionReturn(0); 1434 } 1435 1436 #undef __FUNCT__ 1437 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1438 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1439 { 1440 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1441 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1442 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1443 PetscScalar *xarray,*yarray; 1444 PetscErrorCode ierr; 1445 cusparseStatus_t stat; 1446 1447 PetscFunctionBegin; 1448 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1449 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1450 if (!matstructT) { 1451 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1452 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1453 } 1454 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1455 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1456 1457 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1458 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1459 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1460 mat->num_rows, mat->num_cols, 1461 mat->num_entries, matstructT->alpha, matstructT->descr, 1462 mat->values->data().get(), mat->row_offsets->data().get(), 1463 mat->column_indices->data().get(), xarray, matstructT->beta, 1464 yarray);CHKERRCUDA(stat); 1465 } else { 1466 #if CUDA_VERSION>=4020 1467 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1468 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1469 matstructT->alpha, matstructT->descr, hybMat, 1470 xarray, matstructT->beta, 1471 yarray);CHKERRCUDA(stat); 1472 #endif 1473 } 1474 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1475 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1476 if (!cusparsestruct->stream) { 1477 ierr = WaitForGPU();CHKERRCUDA(ierr); 1478 } 1479 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1480 PetscFunctionReturn(0); 1481 } 1482 1483 1484 #undef __FUNCT__ 1485 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1486 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1487 { 1488 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1489 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1490 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1491 thrust::device_ptr<PetscScalar> zptr; 1492 PetscScalar *xarray,*zarray; 1493 PetscErrorCode ierr; 1494 cusparseStatus_t stat; 1495 1496 PetscFunctionBegin; 1497 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1498 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1499 try { 1500 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1501 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1502 ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1503 zptr = thrust::device_pointer_cast(zarray); 1504 1505 /* multiply add */ 1506 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1507 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1508 /* here we need to be careful to set the number of rows in the multiply to the 1509 number of compressed rows in the matrix ... which is equivalent to the 1510 size of the workVector */ 1511 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1512 mat->num_rows, mat->num_cols, 1513 mat->num_entries, matstruct->alpha, matstruct->descr, 1514 mat->values->data().get(), mat->row_offsets->data().get(), 1515 mat->column_indices->data().get(), xarray, matstruct->beta, 1516 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1517 } else { 1518 #if CUDA_VERSION>=4020 1519 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1520 if (cusparsestruct->workVector->size()) { 1521 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1522 matstruct->alpha, matstruct->descr, hybMat, 1523 xarray, matstruct->beta, 1524 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1525 } 1526 #endif 1527 } 1528 1529 /* scatter the data from the temporary into the full vector with a += operation */ 1530 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1531 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1532 VecCUDAPlusEquals()); 1533 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1534 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1535 1536 } catch(char *ex) { 1537 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1538 } 1539 ierr = WaitForGPU();CHKERRCUDA(ierr); 1540 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1541 PetscFunctionReturn(0); 1542 } 1543 1544 #undef __FUNCT__ 1545 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1546 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1547 { 1548 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1549 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1550 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1551 thrust::device_ptr<PetscScalar> zptr; 1552 PetscScalar *xarray,*zarray; 1553 PetscErrorCode ierr; 1554 cusparseStatus_t stat; 1555 1556 PetscFunctionBegin; 1557 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1558 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1559 if (!matstructT) { 1560 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1561 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1562 } 1563 1564 try { 1565 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1566 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1567 ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1568 zptr = thrust::device_pointer_cast(zarray); 1569 1570 /* multiply add with matrix transpose */ 1571 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1572 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1573 /* here we need to be careful to set the number of rows in the multiply to the 1574 number of compressed rows in the matrix ... which is equivalent to the 1575 size of the workVector */ 1576 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1577 mat->num_rows, mat->num_cols, 1578 mat->num_entries, matstructT->alpha, matstructT->descr, 1579 mat->values->data().get(), mat->row_offsets->data().get(), 1580 mat->column_indices->data().get(), xarray, matstructT->beta, 1581 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1582 } else { 1583 #if CUDA_VERSION>=4020 1584 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1585 if (cusparsestruct->workVector->size()) { 1586 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1587 matstructT->alpha, matstructT->descr, hybMat, 1588 xarray, matstructT->beta, 1589 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1590 } 1591 #endif 1592 } 1593 1594 /* scatter the data from the temporary into the full vector with a += operation */ 1595 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1596 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1597 VecCUDAPlusEquals()); 1598 1599 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1600 ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1601 1602 } catch(char *ex) { 1603 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1604 } 1605 ierr = WaitForGPU();CHKERRCUDA(ierr); 1606 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1607 PetscFunctionReturn(0); 1608 } 1609 1610 #undef __FUNCT__ 1611 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1612 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1613 { 1614 PetscErrorCode ierr; 1615 1616 PetscFunctionBegin; 1617 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1618 if (A->factortype==MAT_FACTOR_NONE) { 1619 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1620 } 1621 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1622 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1623 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1624 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1625 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1626 PetscFunctionReturn(0); 1627 } 1628 1629 /* --------------------------------------------------------------------------------*/ 1630 #undef __FUNCT__ 1631 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1632 /*@ 1633 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1634 (the default parallel PETSc format). This matrix will ultimately pushed down 1635 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1636 assembly performance the user should preallocate the matrix storage by setting 1637 the parameter nz (or the array nnz). By setting these parameters accurately, 1638 performance during matrix assembly can be increased by more than a factor of 50. 1639 1640 Collective on MPI_Comm 1641 1642 Input Parameters: 1643 + comm - MPI communicator, set to PETSC_COMM_SELF 1644 . m - number of rows 1645 . n - number of columns 1646 . nz - number of nonzeros per row (same for all rows) 1647 - nnz - array containing the number of nonzeros in the various rows 1648 (possibly different for each row) or NULL 1649 1650 Output Parameter: 1651 . A - the matrix 1652 1653 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1654 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1655 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1656 1657 Notes: 1658 If nnz is given then nz is ignored 1659 1660 The AIJ format (also called the Yale sparse matrix format or 1661 compressed row storage), is fully compatible with standard Fortran 77 1662 storage. That is, the stored row and column indices can begin at 1663 either one (as in Fortran) or zero. See the users' manual for details. 1664 1665 Specify the preallocated storage with either nz or nnz (not both). 1666 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1667 allocation. For large problems you MUST preallocate memory or you 1668 will get TERRIBLE performance, see the users' manual chapter on matrices. 1669 1670 By default, this format uses inodes (identical nodes) when possible, to 1671 improve numerical efficiency of matrix-vector products and solves. We 1672 search for consecutive rows with the same nonzero structure, thereby 1673 reusing matrix information to achieve increased efficiency. 1674 1675 Level: intermediate 1676 1677 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1678 @*/ 1679 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1680 { 1681 PetscErrorCode ierr; 1682 1683 PetscFunctionBegin; 1684 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1685 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1686 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1687 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1688 PetscFunctionReturn(0); 1689 } 1690 1691 #undef __FUNCT__ 1692 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1693 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1694 { 1695 PetscErrorCode ierr; 1696 1697 PetscFunctionBegin; 1698 if (A->factortype==MAT_FACTOR_NONE) { 1699 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1700 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1701 } 1702 } else { 1703 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1704 } 1705 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1706 PetscFunctionReturn(0); 1707 } 1708 1709 #undef __FUNCT__ 1710 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1711 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1712 { 1713 PetscErrorCode ierr; 1714 cusparseStatus_t stat; 1715 cusparseHandle_t handle=0; 1716 1717 PetscFunctionBegin; 1718 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1719 if (B->factortype==MAT_FACTOR_NONE) { 1720 /* you cannot check the inode.use flag here since the matrix was just created. 1721 now build a GPU matrix data structure */ 1722 B->spptr = new Mat_SeqAIJCUSPARSE; 1723 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1725 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1726 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1727 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1728 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1729 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1730 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1731 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1732 } else { 1733 /* NEXT, set the pointers to the triangular factors */ 1734 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1736 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1737 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1739 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1740 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1741 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1742 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1743 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1744 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1745 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1746 } 1747 1748 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1749 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1750 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1751 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1752 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1753 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1754 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1755 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1756 1757 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1758 1759 B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED; 1760 1761 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1762 PetscFunctionReturn(0); 1763 } 1764 1765 /*M 1766 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1767 1768 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1769 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1770 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1771 1772 Options Database Keys: 1773 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1774 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1775 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1776 1777 Level: beginner 1778 1779 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1780 M*/ 1781 1782 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1783 1784 1785 #undef __FUNCT__ 1786 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1787 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1788 { 1789 PetscErrorCode ierr; 1790 1791 PetscFunctionBegin; 1792 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1793 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1794 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1795 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1796 PetscFunctionReturn(0); 1797 } 1798 1799 1800 #undef __FUNCT__ 1801 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1802 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1803 { 1804 cusparseStatus_t stat; 1805 cusparseHandle_t handle; 1806 1807 PetscFunctionBegin; 1808 if (*cusparsestruct) { 1809 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1810 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1811 delete (*cusparsestruct)->workVector; 1812 if (handle = (*cusparsestruct)->handle) { 1813 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1814 } 1815 delete *cusparsestruct; 1816 *cusparsestruct = 0; 1817 } 1818 PetscFunctionReturn(0); 1819 } 1820 1821 #undef __FUNCT__ 1822 #define __FUNCT__ "CsrMatrix_Destroy" 1823 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1824 { 1825 PetscFunctionBegin; 1826 if (*mat) { 1827 delete (*mat)->values; 1828 delete (*mat)->column_indices; 1829 delete (*mat)->row_offsets; 1830 delete *mat; 1831 *mat = 0; 1832 } 1833 PetscFunctionReturn(0); 1834 } 1835 1836 #undef __FUNCT__ 1837 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1838 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1839 { 1840 cusparseStatus_t stat; 1841 PetscErrorCode ierr; 1842 1843 PetscFunctionBegin; 1844 if (*trifactor) { 1845 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1846 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1847 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1848 delete *trifactor; 1849 *trifactor = 0; 1850 } 1851 PetscFunctionReturn(0); 1852 } 1853 1854 #undef __FUNCT__ 1855 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1856 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1857 { 1858 CsrMatrix *mat; 1859 cusparseStatus_t stat; 1860 cudaError_t err; 1861 1862 PetscFunctionBegin; 1863 if (*matstruct) { 1864 if ((*matstruct)->mat) { 1865 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1866 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1867 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1868 } else { 1869 mat = (CsrMatrix*)(*matstruct)->mat; 1870 CsrMatrix_Destroy(&mat); 1871 } 1872 } 1873 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1874 delete (*matstruct)->cprowIndices; 1875 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1876 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1877 delete *matstruct; 1878 *matstruct = 0; 1879 } 1880 PetscFunctionReturn(0); 1881 } 1882 1883 #undef __FUNCT__ 1884 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1885 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1886 { 1887 cusparseHandle_t handle; 1888 cusparseStatus_t stat; 1889 1890 PetscFunctionBegin; 1891 if (*trifactors) { 1892 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1893 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1894 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1895 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1896 delete (*trifactors)->rpermIndices; 1897 delete (*trifactors)->cpermIndices; 1898 delete (*trifactors)->workVector; 1899 if (handle = (*trifactors)->handle) { 1900 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1901 } 1902 delete *trifactors; 1903 *trifactors = 0; 1904 } 1905 PetscFunctionReturn(0); 1906 } 1907 1908