1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode 
Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix' GPU data
   structure and installs it on the stored cusparse handle, so that subsequent
   cusparse calls made through that handle are queued on this stream.

   NOTE(review): A->spptr is dereferenced without a NULL check; callers are
   assumed to pass a matrix whose CUSPARSE structure is already set up.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - replaces the cusparse handle stored in the matrix'
   GPU data structure.  A previously stored handle is destroyed first (it is
   owned by the struct); the new handle is then switched to device pointer
   mode, i.e. scalar arguments passed to cusparse are expected to reside in
   device memory.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* destroy the handle we currently own so it is not leaked */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - forgets (does NOT destroy) the cusparse handle
   stored in the matrix' GPU data structure.  Used when the handle is owned
   elsewhere; whoever set it remains responsible for destroying it.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package used for this factorization. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type
providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty, square n x n) factor
   matrix B for matrix A and installs the symbolic-factorization entry points for
   the requested factor type.  Numeric entry points are installed later by the
   symbolic routines.  Allocation of the AIJ arrays is skipped: the symbolic
   factorization fills them in.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* block sizes are copied from A only on the LU/ILU path (matches upstream behavior) */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(): stores the requested GPU storage format in the
   matrix' Mat_SeqAIJCUSPARSE struct.  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   both set the single format field here, since SEQAIJCUSPARSE keeps one format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUDA only supports CSR; ELL and HYB are rejected, CSR is a no-op
     (the struct's default format is used) */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if this Mat type provides one;
     silently a no-op for types without a MatCUSPARSESetFormat_C method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_mult_storage_format
   (SpMV only; skipped for factor matrices) and -mat_cusparse_storage_format
   (SpMV and triangular solve) options and forwards them to MatCUSPARSESetFormat().

   NOTE(review): PetscOptionsHead() is called here without a matching
   PetscOptionsTail(), and the PetscObjectOptionsBegin() result is not passed
   through CHKERRQ -- verify against this PETSc version's options API whether
   the Head/ObjectOptionsBegin pairing is intentional.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the MULT-only format option makes no sense on a factored matrix */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef
__FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* ILU(k) symbolic factorization: delegates to the CPU SeqAIJ implementation and
   then installs the CUSPARSE numeric-factorization entry point on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Full LU symbolic factorization: delegates to the CPU SeqAIJ implementation and
   then installs the CUSPARSE numeric-factorization entry point on B. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* ICC(k) symbolic factorization: delegates to the CPU SeqAIJ implementation and
   then installs the CUSPARSE Cholesky numeric entry point on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Full Cholesky symbolic factorization: delegates to the CPU SeqAIJ implementation
   and then installs the CUSPARSE Cholesky numeric entry point on B. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extracts the strictly-lower part of the (already numerically factored) AIJ
   matrix, adds unit diagonal entries, uploads it as a CSR triangular matrix, and
   runs the cusparse solve analysis on it.  Host staging buffers are pinned
   (cudaMallocHost) so the thrust assigns copy fast. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *ai = a->i,*aj = a->j,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiLo, *AjLo;
  PetscScalar *AALo;
  PetscInt i,nz, nzLower, offset, rowOffset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1]; assumes n >= 1 -- confirm callers never pass an empty matrix */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         each later row copies its strictly-lower entries then appends a 1.0 on
         the diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v = aa;
      vi = aj;
      offset = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i] = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill,
         unit diagonal (the 1.0's stored above are implied by the descriptor) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device copies of the three CSR arrays (thrust assign = host-to-device copy) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extracts the upper-triangular part (with inverted diagonal, see below) of the
   factored AIJ matrix, uploads it as a CSR triangular matrix, and runs the
   cusparse solve analysis on it.  Rows are filled back-to-front using a->diag. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  const PetscInt *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar *aa = a->a,*v;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscInt i,nz, nzUpper, offset;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first since
         the SeqAIJ factored storage keeps U rows in reverse via a->diag */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: the factored storage keeps the
           reciprocal pivot at v[nz], and 1./v[nz] recovers the diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i] = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill,
         non-unit diagonal (the diagonal values stored above are used) */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* device copies of the three CSR arrays */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Builds both ILU triangular factors on the GPU, allocates the shared work
   vector, and (if the row/column orderings are not the identity) uploads the
   permutation index arrays used by the permuted MatSolve variants. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS isrow = a->row,iscol = a->icol;
  PetscBool row_identity,col_identity;
  const PetscInt *r,*c;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only upload a permutation when one is actually needed */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds both Cholesky/ICC triangular factors on the GPU from the factored
   matrix.  Only the upper triangle U (unit diagonal descriptor, scaled values)
   is stored explicitly; the "lower" factor reuses the same sparsity pattern
   (AiUp/AjUp) with rescaled values (AALo) and is solved with the TRANSPOSE
   operation, i.e. it represents U^T = L without a second transpose pass. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t stat;
  PetscErrorCode ierr;
  PetscInt *AiUp, *AjUp;
  PetscScalar *AAUp;
  PetscScalar *AALo;
  PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     here; this relies on the two structs sharing the i/j/a field layout --
     confirm against this PETSc version's impl headers */
  Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt *ai = b->i,*aj = b->j,*vj;
  const MatScalar *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (v[nz] is the stored pivot) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i] = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals: negate for U, and additionally divide by the pivot
             for the (transposed) lower factor */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular with unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the data is the UPPER triangle with
         non-unit diagonal; combined with the TRANSPOSE solve op below it acts
         as the lower factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern as U, different (rescaled) values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Builds the ICC triangular factors on the GPU, allocates the shared work
   vector, records the true factor nnz (both triangles share one stored
   pattern), and uploads the permutation indices when the ordering is not
   the identity (the same permutation is used for rows and columns). */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS ip = a->row;
  const PetscInt *rip;
  PetscBool perm_identity;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* both triangles share the stored pattern: count off-diagonals twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: delegates to the CPU SeqAIJ kernel, selects the
   natural-ordering or permuted MatSolve variants based on the orderings, then
   copies the triangular factors to the GPU and runs the solve analyses. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS isrow = b->row,iscol = b->col;
  PetscBool row_identity,col_identity;
  PetscErrorCode
ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* no permutation work needed in the solves */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: delegates to the CPU SeqAIJ kernel, selects
   the natural-ordering or permuted MatSolve variants based on the ordering,
   then copies the triangular factors to the GPU and runs the solve analyses. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS ip = b->row;
  PetscBool perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds transposed copies (via csr2csc) of both triangular factors and runs
   the cusparse solve analysis on them, so MatSolveTranspose can run without
   transposing on every solve.  The transposed factors flip the fill mode and
   keep the remaining descriptor settings of the originals. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cusparseMatrixType_t matrixType;
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 765 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 766 diagType = cusparseGetMatDiagType(loTriFactor->descr); 767 768 /* Create the matrix description */ 769 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 770 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 771 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 772 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 773 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 774 775 /* Create the solve analysis information */ 776 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 777 778 /* set the operation */ 779 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 780 781 /* allocate GPU space for the CSC of the lower triangular factor*/ 782 loTriFactorT->csrMat = new CsrMatrix; 783 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 784 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 785 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 786 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 787 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 788 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 789 790 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 791 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 792 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 793 loTriFactor->csrMat->values->data().get(), 794 loTriFactor->csrMat->row_offsets->data().get(), 795 loTriFactor->csrMat->column_indices->data().get(), 796 loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->csrMat->row_offsets->data().get(), 799 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 800 801 /* perform the solve analysis on the transposed matrix */ 802 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 803 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 804 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 805 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 806 loTriFactorT->solveInfo);CHKERRCUDA(stat); 807 808 /* assign the pointer. Is this really necessary? */ 809 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 810 811 /*********************************************/ 812 /* Now the Transpose of the Upper Tri Factor */ 813 /*********************************************/ 814 815 /* allocate space for the transpose of the upper triangular factor */ 816 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 817 818 /* set the matrix descriptors of the upper triangular factor */ 819 matrixType = cusparseGetMatType(upTriFactor->descr); 820 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 821 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
822 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 823 diagType = cusparseGetMatDiagType(upTriFactor->descr); 824 825 /* Create the matrix description */ 826 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 827 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 828 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 829 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 830 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 831 832 /* Create the solve analysis information */ 833 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 834 835 /* set the operation */ 836 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 837 838 /* allocate GPU space for the CSC of the upper triangular factor*/ 839 upTriFactorT->csrMat = new CsrMatrix; 840 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 841 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 842 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 843 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 844 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 845 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 846 847 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 848 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 849 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 850 upTriFactor->csrMat->values->data().get(), 851 upTriFactor->csrMat->row_offsets->data().get(), 852 upTriFactor->csrMat->column_indices->data().get(), 853 upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->csrMat->row_offsets->data().get(), 856 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 857 858 /* perform the solve analysis on the transposed matrix */ 859 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 860 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 861 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 862 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 863 upTriFactorT->solveInfo);CHKERRCUDA(stat); 864 865 /* assign the pointer. Is this really necessary? 
*/ 866 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 867 PetscFunctionReturn(0); 868 } 869 870 #undef __FUNCT__ 871 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 872 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 873 { 874 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 875 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 876 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 877 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 878 cusparseStatus_t stat; 879 cusparseIndexBase_t indexBase; 880 cudaError_t err; 881 882 PetscFunctionBegin; 883 884 /* allocate space for the triangular factor information */ 885 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 886 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 887 indexBase = cusparseGetMatIndexBase(matstruct->descr); 888 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 889 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 890 891 /* set alpha and beta */ 892 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 893 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 894 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 895 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 896 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 897 898 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 899 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 900 CsrMatrix *matrixT= new CsrMatrix; 901 matrixT->num_rows = A->rmap->n; 902 matrixT->num_cols = A->cmap->n; 903 matrixT->num_entries = a->nz; 904 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
905 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 906 matrixT->values = new THRUSTARRAY(a->nz); 907 908 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 909 indexBase = cusparseGetMatIndexBase(matstruct->descr); 910 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 911 matrix->num_cols, matrix->num_entries, 912 matrix->values->data().get(), 913 matrix->row_offsets->data().get(), 914 matrix->column_indices->data().get(), 915 matrixT->values->data().get(), 916 matrixT->column_indices->data().get(), 917 matrixT->row_offsets->data().get(), 918 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 919 920 /* assign the pointer */ 921 matstructT->mat = matrixT; 922 923 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 924 #if CUDA_VERSION>=5000 925 /* First convert HYB to CSR */ 926 CsrMatrix *temp= new CsrMatrix; 927 temp->num_rows = A->rmap->n; 928 temp->num_cols = A->cmap->n; 929 temp->num_entries = a->nz; 930 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 931 temp->column_indices = new THRUSTINTARRAY32(a->nz); 932 temp->values = new THRUSTARRAY(a->nz); 933 934 935 stat = cusparse_hyb2csr(cusparsestruct->handle, 936 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 937 temp->values->data().get(), 938 temp->row_offsets->data().get(), 939 temp->column_indices->data().get());CHKERRCUDA(stat); 940 941 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 942 CsrMatrix *tempT= new CsrMatrix; 943 tempT->num_rows = A->rmap->n; 944 tempT->num_cols = A->cmap->n; 945 tempT->num_entries = a->nz; 946 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 947 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 948 tempT->values = new THRUSTARRAY(a->nz); 949 950 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 951 temp->num_cols, temp->num_entries, 952 temp->values->data().get(), 953 temp->row_offsets->data().get(), 954 temp->column_indices->data().get(), 955 tempT->values->data().get(), 956 tempT->column_indices->data().get(), 957 tempT->row_offsets->data().get(), 958 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 959 960 /* Last, convert CSC to HYB */ 961 cusparseHybMat_t hybMat; 962 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 963 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 964 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 965 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 966 matstructT->descr, tempT->values->data().get(), 967 tempT->row_offsets->data().get(), 968 tempT->column_indices->data().get(), 969 hybMat, 0, partition);CHKERRCUDA(stat); 970 971 /* assign the pointer */ 972 matstructT->mat = hybMat; 973 974 /* delete temporaries */ 975 if (tempT) { 976 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 977 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 978 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 979 delete (CsrMatrix*) tempT; 980 } 981 if (temp) { 982 if (temp->values) delete (THRUSTARRAY*) temp->values; 983 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 984 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 985 delete (CsrMatrix*) temp; 986 } 987 #else 988 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in 
MatMultTranspose) require CUDA 5.0 or later."); 989 #endif 990 } 991 /* assign the compressed row indices */ 992 matstructT->cprowIndices = new THRUSTINTARRAY; 993 994 /* assign the pointer */ 995 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 996 PetscFunctionReturn(0); 997 } 998 999 #undef __FUNCT__ 1000 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 1001 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1002 { 1003 PetscInt n = xx->map->n; 1004 const PetscScalar *barray; 1005 PetscScalar *xarray; 1006 thrust::device_ptr<const PetscScalar> bGPU; 1007 thrust::device_ptr<PetscScalar> xGPU; 1008 cusparseStatus_t stat; 1009 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1010 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1011 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1012 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1013 PetscErrorCode ierr; 1014 1015 PetscFunctionBegin; 1016 /* Analyze the matrix and create the transpose ... 
on the fly */ 1017 if (!loTriFactorT && !upTriFactorT) { 1018 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1019 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1020 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1021 } 1022 1023 /* Get the GPU pointers */ 1024 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1025 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1026 xGPU = thrust::device_pointer_cast(xarray); 1027 bGPU = thrust::device_pointer_cast(barray); 1028 1029 /* First, reorder with the row permutation */ 1030 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1031 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1032 xGPU); 1033 1034 /* First, solve U */ 1035 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1036 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1037 upTriFactorT->csrMat->values->data().get(), 1038 upTriFactorT->csrMat->row_offsets->data().get(), 1039 upTriFactorT->csrMat->column_indices->data().get(), 1040 upTriFactorT->solveInfo, 1041 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1042 1043 /* Then, solve L */ 1044 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1045 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1046 loTriFactorT->csrMat->values->data().get(), 1047 loTriFactorT->csrMat->row_offsets->data().get(), 1048 loTriFactorT->csrMat->column_indices->data().get(), 1049 loTriFactorT->solveInfo, 1050 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1051 1052 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. 
*/ 1053 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1054 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1055 tempGPU->begin()); 1056 1057 /* Copy the temporary to the full solution. */ 1058 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1059 1060 /* restore */ 1061 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1062 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1063 ierr = WaitForGPU();CHKERRCUDA(ierr); 1064 1065 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1066 PetscFunctionReturn(0); 1067 } 1068 1069 #undef __FUNCT__ 1070 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1071 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1072 { 1073 const PetscScalar *barray; 1074 PetscScalar *xarray; 1075 cusparseStatus_t stat; 1076 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1077 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1078 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1079 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1080 PetscErrorCode ierr; 1081 1082 PetscFunctionBegin; 1083 /* Analyze the matrix and create the transpose ... 
on the fly */ 1084 if (!loTriFactorT && !upTriFactorT) { 1085 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1086 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1087 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1088 } 1089 1090 /* Get the GPU pointers */ 1091 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1092 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1093 1094 /* First, solve U */ 1095 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1096 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1097 upTriFactorT->csrMat->values->data().get(), 1098 upTriFactorT->csrMat->row_offsets->data().get(), 1099 upTriFactorT->csrMat->column_indices->data().get(), 1100 upTriFactorT->solveInfo, 1101 barray, tempGPU->data().get());CHKERRCUDA(stat); 1102 1103 /* Then, solve L */ 1104 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1105 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1106 loTriFactorT->csrMat->values->data().get(), 1107 loTriFactorT->csrMat->row_offsets->data().get(), 1108 loTriFactorT->csrMat->column_indices->data().get(), 1109 loTriFactorT->solveInfo, 1110 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1111 1112 /* restore */ 1113 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1114 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1115 ierr = WaitForGPU();CHKERRCUDA(ierr); 1116 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1117 PetscFunctionReturn(0); 1118 } 1119 1120 #undef __FUNCT__ 1121 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1122 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1123 { 1124 const PetscScalar *barray; 1125 PetscScalar *xarray; 1126 thrust::device_ptr<const PetscScalar> bGPU; 1127 thrust::device_ptr<PetscScalar> xGPU; 1128 cusparseStatus_t stat; 1129 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1130 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1131 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1132 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1133 PetscErrorCode ierr; 1134 VecType t; 1135 PetscBool flg; 1136 1137 PetscFunctionBegin; 1138 ierr = VecGetType(bb,&t);CHKERRQ(ierr); 1139 ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr); 1140 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA); 1141 ierr = VecGetType(xx,&t);CHKERRQ(ierr); 1142 ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr); 1143 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). 
Can only deal with %s\n.",t,VECSEQCUDA); 1144 1145 /* Get the GPU pointers */ 1146 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1147 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1148 xGPU = thrust::device_pointer_cast(xarray); 1149 bGPU = thrust::device_pointer_cast(barray); 1150 1151 /* First, reorder with the row permutation */ 1152 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1153 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1154 xGPU); 1155 1156 /* Next, solve L */ 1157 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1158 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1159 loTriFactor->csrMat->values->data().get(), 1160 loTriFactor->csrMat->row_offsets->data().get(), 1161 loTriFactor->csrMat->column_indices->data().get(), 1162 loTriFactor->solveInfo, 1163 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1164 1165 /* Then, solve U */ 1166 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1167 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1168 upTriFactor->csrMat->values->data().get(), 1169 upTriFactor->csrMat->row_offsets->data().get(), 1170 upTriFactor->csrMat->column_indices->data().get(), 1171 upTriFactor->solveInfo, 1172 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1173 1174 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1175 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1176 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1177 tempGPU->begin()); 1178 1179 /* Copy the temporary to the full solution. 
*/ 1180 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1181 1182 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1183 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1184 ierr = WaitForGPU();CHKERRCUDA(ierr); 1185 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1186 PetscFunctionReturn(0); 1187 } 1188 1189 #undef __FUNCT__ 1190 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1191 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1192 { 1193 const PetscScalar *barray; 1194 PetscScalar *xarray; 1195 cusparseStatus_t stat; 1196 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1197 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1198 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1199 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1200 PetscErrorCode ierr; 1201 1202 PetscFunctionBegin; 1203 /* Get the GPU pointers */ 1204 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1205 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1206 1207 /* First, solve L */ 1208 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1209 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1210 loTriFactor->csrMat->values->data().get(), 1211 loTriFactor->csrMat->row_offsets->data().get(), 1212 loTriFactor->csrMat->column_indices->data().get(), 1213 loTriFactor->solveInfo, 1214 barray, tempGPU->data().get());CHKERRCUDA(stat); 1215 1216 /* Next, solve U */ 1217 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1218 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1219 upTriFactor->csrMat->values->data().get(), 1220 upTriFactor->csrMat->row_offsets->data().get(), 1221 upTriFactor->csrMat->column_indices->data().get(), 
1222 upTriFactor->solveInfo, 1223 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1224 1225 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1226 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1227 ierr = WaitForGPU();CHKERRCUDA(ierr); 1228 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 #undef __FUNCT__ 1233 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1234 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1235 { 1236 1237 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1238 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1239 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1240 PetscInt m = A->rmap->n,*ii,*ridx; 1241 PetscErrorCode ierr; 1242 cusparseStatus_t stat; 1243 cudaError_t err; 1244 1245 PetscFunctionBegin; 1246 if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) { 1247 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1248 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1249 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1250 /* copy values only */ 1251 matrix->values->assign(a->a, a->a+a->nz); 1252 } else { 1253 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1254 try { 1255 cusparsestruct->nonzerorow=0; 1256 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1257 1258 if (a->compressedrow.use) { 1259 m = a->compressedrow.nrows; 1260 ii = a->compressedrow.i; 1261 ridx = a->compressedrow.rindex; 1262 } else { 1263 /* Forcing compressed row on the GPU */ 1264 int k=0; 1265 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1266 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1267 ii[0]=0; 1268 for (int j = 0; j<m; j++) { 1269 if ((a->i[j+1]-a->i[j])>0) { 1270 ii[k] = 
a->i[j]; 1271 ridx[k]= j; 1272 k++; 1273 } 1274 } 1275 ii[cusparsestruct->nonzerorow] = a->nz; 1276 m = cusparsestruct->nonzerorow; 1277 } 1278 1279 /* allocate space for the triangular factor information */ 1280 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1281 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1282 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1283 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1284 1285 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 1286 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1287 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err); 1288 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1289 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1290 1291 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1292 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1293 /* set the matrix */ 1294 CsrMatrix *matrix= new CsrMatrix; 1295 matrix->num_rows = m; 1296 matrix->num_cols = A->cmap->n; 1297 matrix->num_entries = a->nz; 1298 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1299 matrix->row_offsets->assign(ii, ii + m+1); 1300 1301 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1302 matrix->column_indices->assign(a->j, a->j+a->nz); 1303 1304 matrix->values = new THRUSTARRAY(a->nz); 1305 matrix->values->assign(a->a, a->a+a->nz); 1306 1307 /* assign the pointer */ 1308 matstruct->mat = matrix; 1309 1310 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1311 #if CUDA_VERSION>=4020 1312 CsrMatrix *matrix= new CsrMatrix; 1313 matrix->num_rows = m; 1314 matrix->num_cols = A->cmap->n; 1315 matrix->num_entries = a->nz; 1316 
matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1317 matrix->row_offsets->assign(ii, ii + m+1); 1318 1319 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1320 matrix->column_indices->assign(a->j, a->j+a->nz); 1321 1322 matrix->values = new THRUSTARRAY(a->nz); 1323 matrix->values->assign(a->a, a->a+a->nz); 1324 1325 cusparseHybMat_t hybMat; 1326 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1327 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1328 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1329 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1330 matstruct->descr, matrix->values->data().get(), 1331 matrix->row_offsets->data().get(), 1332 matrix->column_indices->data().get(), 1333 hybMat, 0, partition);CHKERRCUDA(stat); 1334 /* assign the pointer */ 1335 matstruct->mat = hybMat; 1336 1337 if (matrix) { 1338 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1339 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1340 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1341 delete (CsrMatrix*)matrix; 1342 } 1343 #endif 1344 } 1345 1346 /* assign the compressed row indices */ 1347 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1348 matstruct->cprowIndices->assign(ridx,ridx+m); 1349 1350 /* assign the pointer */ 1351 cusparsestruct->mat = matstruct; 1352 1353 if (!a->compressedrow.use) { 1354 ierr = PetscFree(ii);CHKERRQ(ierr); 1355 ierr = PetscFree(ridx);CHKERRQ(ierr); 1356 } 1357 cusparsestruct->workVector = new THRUSTARRAY; 1358 cusparsestruct->workVector->resize(m); 1359 } catch(char *ex) { 1360 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1361 } 1362 cusparsestruct->nonzerostate = A->nonzerostate; 1363 } 1364 ierr = WaitForGPU();CHKERRCUDA(ierr); 1365 A->valid_GPU_matrix = PETSC_CUDA_BOTH; 1366 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1367 } 1368 
PetscFunctionReturn(0); 1369 } 1370 1371 #undef __FUNCT__ 1372 #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE" 1373 static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1374 { 1375 PetscErrorCode ierr; 1376 PetscInt rbs,cbs; 1377 1378 PetscFunctionBegin; 1379 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 1380 if (right) { 1381 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1382 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1383 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 1384 ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr); 1385 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1386 } 1387 if (left) { 1388 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1389 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1390 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 1391 ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr); 1392 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1393 } 1394 PetscFunctionReturn(0); 1395 } 1396 1397 struct VecCUDAPlusEquals 1398 { 1399 template <typename Tuple> 1400 __host__ __device__ 1401 void operator()(Tuple t) 1402 { 1403 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1404 } 1405 }; 1406 1407 #undef __FUNCT__ 1408 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1409 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1410 { 1411 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1412 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1413 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1414 const PetscScalar *xarray; 1415 PetscScalar *yarray; 1416 PetscErrorCode ierr; 1417 cusparseStatus_t stat; 1418 1419 PetscFunctionBegin; 1420 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1421 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 
1422 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1423 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1424 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1425 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1426 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1427 mat->num_rows, mat->num_cols, mat->num_entries, 1428 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1429 mat->column_indices->data().get(), xarray, matstruct->beta, 1430 yarray);CHKERRCUDA(stat); 1431 } else { 1432 #if CUDA_VERSION>=4020 1433 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1434 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1435 matstruct->alpha, matstruct->descr, hybMat, 1436 xarray, matstruct->beta, 1437 yarray);CHKERRCUDA(stat); 1438 #endif 1439 } 1440 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1441 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1442 if (!cusparsestruct->stream) { 1443 ierr = WaitForGPU();CHKERRCUDA(ierr); 1444 } 1445 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1446 PetscFunctionReturn(0); 1447 } 1448 1449 #undef __FUNCT__ 1450 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1451 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1452 { 1453 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1454 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1455 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1456 const PetscScalar *xarray; 1457 PetscScalar *yarray; 1458 PetscErrorCode ierr; 1459 cusparseStatus_t stat; 1460 1461 PetscFunctionBegin; 1462 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1463 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1464 if (!matstructT) { 1465 ierr = 
MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1466 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1467 } 1468 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1469 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1470 1471 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1472 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1473 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1474 mat->num_rows, mat->num_cols, 1475 mat->num_entries, matstructT->alpha, matstructT->descr, 1476 mat->values->data().get(), mat->row_offsets->data().get(), 1477 mat->column_indices->data().get(), xarray, matstructT->beta, 1478 yarray);CHKERRCUDA(stat); 1479 } else { 1480 #if CUDA_VERSION>=4020 1481 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1482 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1483 matstructT->alpha, matstructT->descr, hybMat, 1484 xarray, matstructT->beta, 1485 yarray);CHKERRCUDA(stat); 1486 #endif 1487 } 1488 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1489 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1490 if (!cusparsestruct->stream) { 1491 ierr = WaitForGPU();CHKERRCUDA(ierr); 1492 } 1493 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1494 PetscFunctionReturn(0); 1495 } 1496 1497 1498 #undef __FUNCT__ 1499 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1500 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1501 { 1502 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1503 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1504 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1505 thrust::device_ptr<PetscScalar> zptr; 1506 const PetscScalar *xarray; 1507 PetscScalar *zarray; 1508 PetscErrorCode ierr; 1509 cusparseStatus_t stat; 1510 1511 PetscFunctionBegin; 1512 /* The line below is necessary due to the 
operations that modify the matrix on the CPU (axpy, scale, etc) */ 1513 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1514 try { 1515 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1516 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1517 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1518 zptr = thrust::device_pointer_cast(zarray); 1519 1520 /* multiply add */ 1521 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1522 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1523 /* here we need to be careful to set the number of rows in the multiply to the 1524 number of compressed rows in the matrix ... which is equivalent to the 1525 size of the workVector */ 1526 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1527 mat->num_rows, mat->num_cols, 1528 mat->num_entries, matstruct->alpha, matstruct->descr, 1529 mat->values->data().get(), mat->row_offsets->data().get(), 1530 mat->column_indices->data().get(), xarray, matstruct->beta, 1531 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1532 } else { 1533 #if CUDA_VERSION>=4020 1534 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1535 if (cusparsestruct->workVector->size()) { 1536 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1537 matstruct->alpha, matstruct->descr, hybMat, 1538 xarray, matstruct->beta, 1539 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1540 } 1541 #endif 1542 } 1543 1544 /* scatter the data from the temporary into the full vector with a += operation */ 1545 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1546 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1547 VecCUDAPlusEquals()); 1548 ierr = 
VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1549 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1550 1551 } catch(char *ex) { 1552 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1553 } 1554 ierr = WaitForGPU();CHKERRCUDA(ierr); 1555 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1556 PetscFunctionReturn(0); 1557 } 1558 1559 #undef __FUNCT__ 1560 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1561 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1562 { 1563 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1564 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1565 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1566 thrust::device_ptr<PetscScalar> zptr; 1567 const PetscScalar *xarray; 1568 PetscScalar *zarray; 1569 PetscErrorCode ierr; 1570 cusparseStatus_t stat; 1571 1572 PetscFunctionBegin; 1573 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1574 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1575 if (!matstructT) { 1576 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1577 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1578 } 1579 1580 try { 1581 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1582 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1583 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1584 zptr = thrust::device_pointer_cast(zarray); 1585 1586 /* multiply add with matrix transpose */ 1587 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1588 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1589 /* here we need to be careful to set the number of rows in the multiply to the 1590 number of compressed rows in the matrix ... 
which is equivalent to the 1591 size of the workVector */ 1592 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1593 mat->num_rows, mat->num_cols, 1594 mat->num_entries, matstructT->alpha, matstructT->descr, 1595 mat->values->data().get(), mat->row_offsets->data().get(), 1596 mat->column_indices->data().get(), xarray, matstructT->beta, 1597 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1598 } else { 1599 #if CUDA_VERSION>=4020 1600 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1601 if (cusparsestruct->workVector->size()) { 1602 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1603 matstructT->alpha, matstructT->descr, hybMat, 1604 xarray, matstructT->beta, 1605 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1606 } 1607 #endif 1608 } 1609 1610 /* scatter the data from the temporary into the full vector with a += operation */ 1611 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1612 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1613 VecCUDAPlusEquals()); 1614 1615 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1616 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1617 1618 } catch(char *ex) { 1619 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1620 } 1621 ierr = WaitForGPU();CHKERRCUDA(ierr); 1622 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 #undef __FUNCT__ 1627 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1628 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1629 { 1630 PetscErrorCode ierr; 1631 1632 PetscFunctionBegin; 1633 ierr = 
MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1634 if (A->factortype==MAT_FACTOR_NONE) { 1635 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1636 } 1637 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1638 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1639 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1640 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1641 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1642 PetscFunctionReturn(0); 1643 } 1644 1645 /* --------------------------------------------------------------------------------*/ 1646 #undef __FUNCT__ 1647 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1648 /*@ 1649 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1650 (the default parallel PETSc format). This matrix will ultimately pushed down 1651 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1652 assembly performance the user should preallocate the matrix storage by setting 1653 the parameter nz (or the array nnz). By setting these parameters accurately, 1654 performance during matrix assembly can be increased by more than a factor of 50. 1655 1656 Collective on MPI_Comm 1657 1658 Input Parameters: 1659 + comm - MPI communicator, set to PETSC_COMM_SELF 1660 . m - number of rows 1661 . n - number of columns 1662 . nz - number of nonzeros per row (same for all rows) 1663 - nnz - array containing the number of nonzeros in the various rows 1664 (possibly different for each row) or NULL 1665 1666 Output Parameter: 1667 . A - the matrix 1668 1669 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1670 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1671 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1672 1673 Notes: 1674 If nnz is given then nz is ignored 1675 1676 The AIJ format (also called the Yale sparse matrix format or 1677 compressed row storage), is fully compatible with standard Fortran 77 1678 storage. That is, the stored row and column indices can begin at 1679 either one (as in Fortran) or zero. See the users' manual for details. 1680 1681 Specify the preallocated storage with either nz or nnz (not both). 1682 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1683 allocation. For large problems you MUST preallocate memory or you 1684 will get TERRIBLE performance, see the users' manual chapter on matrices. 1685 1686 By default, this format uses inodes (identical nodes) when possible, to 1687 improve numerical efficiency of matrix-vector products and solves. We 1688 search for consecutive rows with the same nonzero structure, thereby 1689 reusing matrix information to achieve increased efficiency. 
1690 1691 Level: intermediate 1692 1693 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1694 @*/ 1695 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1696 { 1697 PetscErrorCode ierr; 1698 1699 PetscFunctionBegin; 1700 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1701 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1702 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1703 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1704 PetscFunctionReturn(0); 1705 } 1706 1707 #undef __FUNCT__ 1708 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1709 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1710 { 1711 PetscErrorCode ierr; 1712 1713 PetscFunctionBegin; 1714 if (A->factortype==MAT_FACTOR_NONE) { 1715 if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) { 1716 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1717 } 1718 } else { 1719 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1720 } 1721 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1722 PetscFunctionReturn(0); 1723 } 1724 1725 #undef __FUNCT__ 1726 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1727 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1728 { 1729 PetscErrorCode ierr; 1730 cusparseStatus_t stat; 1731 cusparseHandle_t handle=0; 1732 1733 PetscFunctionBegin; 1734 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1735 if (B->factortype==MAT_FACTOR_NONE) { 1736 /* you cannot check the inode.use flag here since the matrix was just created. 
1737 now build a GPU matrix data structure */ 1738 B->spptr = new Mat_SeqAIJCUSPARSE; 1739 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1740 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1741 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1742 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1743 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1744 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1745 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1746 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1747 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1748 } else { 1749 /* NEXT, set the pointers to the triangular factors */ 1750 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1751 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1752 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1753 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1754 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1755 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1756 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1757 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1758 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1759 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1760 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1761 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1762 } 1763 1764 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1765 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1766 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1767 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1768 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1769 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1770 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1771 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1772 1773 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1774 1775 B->valid_GPU_matrix = 
PETSC_CUDA_UNALLOCATED; 1776 1777 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1778 PetscFunctionReturn(0); 1779 } 1780 1781 /*M 1782 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1783 1784 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1785 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1786 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1787 1788 Options Database Keys: 1789 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1790 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1791 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1792 1793 Level: beginner 1794 1795 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1796 M*/ 1797 1798 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1799 1800 1801 #undef __FUNCT__ 1802 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1803 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1804 { 1805 PetscErrorCode ierr; 1806 1807 PetscFunctionBegin; 1808 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1809 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1810 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1811 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1812 PetscFunctionReturn(0); 1813 } 1814 1815 1816 #undef __FUNCT__ 1817 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1818 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1819 { 1820 cusparseStatus_t stat; 1821 cusparseHandle_t handle; 1822 1823 PetscFunctionBegin; 1824 if (*cusparsestruct) { 1825 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1826 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1827 delete (*cusparsestruct)->workVector; 1828 if (handle = (*cusparsestruct)->handle) { 1829 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1830 } 1831 delete *cusparsestruct; 1832 *cusparsestruct = 0; 1833 } 1834 PetscFunctionReturn(0); 1835 } 1836 1837 #undef __FUNCT__ 1838 #define __FUNCT__ "CsrMatrix_Destroy" 1839 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix 
**mat) 1840 { 1841 PetscFunctionBegin; 1842 if (*mat) { 1843 delete (*mat)->values; 1844 delete (*mat)->column_indices; 1845 delete (*mat)->row_offsets; 1846 delete *mat; 1847 *mat = 0; 1848 } 1849 PetscFunctionReturn(0); 1850 } 1851 1852 #undef __FUNCT__ 1853 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1854 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1855 { 1856 cusparseStatus_t stat; 1857 PetscErrorCode ierr; 1858 1859 PetscFunctionBegin; 1860 if (*trifactor) { 1861 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1862 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1863 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1864 delete *trifactor; 1865 *trifactor = 0; 1866 } 1867 PetscFunctionReturn(0); 1868 } 1869 1870 #undef __FUNCT__ 1871 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1872 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1873 { 1874 CsrMatrix *mat; 1875 cusparseStatus_t stat; 1876 cudaError_t err; 1877 1878 PetscFunctionBegin; 1879 if (*matstruct) { 1880 if ((*matstruct)->mat) { 1881 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1882 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1883 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1884 } else { 1885 mat = (CsrMatrix*)(*matstruct)->mat; 1886 CsrMatrix_Destroy(&mat); 1887 } 1888 } 1889 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1890 delete (*matstruct)->cprowIndices; 1891 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1892 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1893 delete *matstruct; 1894 *matstruct = 0; 1895 } 1896 
PetscFunctionReturn(0); 1897 } 1898 1899 #undef __FUNCT__ 1900 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1901 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1902 { 1903 cusparseHandle_t handle; 1904 cusparseStatus_t stat; 1905 1906 PetscFunctionBegin; 1907 if (*trifactors) { 1908 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1909 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1910 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1911 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1912 delete (*trifactors)->rpermIndices; 1913 delete (*trifactors)->cpermIndices; 1914 delete (*trifactors)->workVector; 1915 if (handle = (*trifactors)->handle) { 1916 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1917 } 1918 delete *trifactors; 1919 *trifactors = 0; 1920 } 1921 PetscFunctionReturn(0); 1922 } 1923 1924