1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc-private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 
89 90 Level: beginner 91 92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 93 M*/ 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 128 PetscFunctionBegin; 129 #if CUDA_VERSION>=4020 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 
cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 #else 141 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 142 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 143 #endif 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 149 operation. Only the MatMult operation can use different GPU storage formats 150 for MPIAIJCUSPARSE matrices. 151 Not Collective 152 153 Input Parameters: 154 + A - Matrix of type SEQAIJCUSPARSE 155 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 156 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 157 158 Output Parameter: 159 160 Level: intermediate 161 162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 163 @*/ 164 #undef __FUNCT__ 165 #define __FUNCT__ "MatCUSPARSESetFormat" 166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 167 { 168 PetscErrorCode ierr; 169 170 PetscFunctionBegin; 171 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 172 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 MatCUSPARSEStorageFormat format; 182 PetscBool flg; 183 184 PetscFunctionBegin; 185 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 186 ierr = PetscObjectOptionsBegin((PetscObject)A); 187 if (A->factortype==MAT_FACTOR_NONE) { 188 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 192 } 193 } 194 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 195 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196 if (flg) { 197 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 198 } 199 ierr = PetscOptionsEnd();CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 202 } 203 204 #undef __FUNCT__ 205 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 206 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat 
A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Symbolic ILU is done on the host by the plain AIJ implementation; only the
     numeric stage is redirected to the CUSPARSE variant. */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: delegate to host AIJ, then route numeric factorization to CUSPARSE. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: delegate to host AIJ, then route numeric factorization to CUSPARSE. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: delegate to host AIJ, then route numeric factorization to CUSPARSE. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Extract the unit lower-triangular factor L from the host ILU factorization
   stored in the Mat_SeqAIJ structure (strictly-lower part of each row plus an
   implicit unit diagonal), upload it as a CSR matrix to the GPU, and run the
   CUSPARSE triangular-solve analysis on it.  Runs only when the GPU copy is
   stale (UNALLOCATED or CPU-only). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                 = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[1] is read here, so n >= 1 is assumed — confirm callers
         never reach this with an empty matrix. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory (cudaMallocHost) for fast host->device transfer */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;       /* row 0 of L is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Extract the upper-triangular factor U (inverted diagonal stored explicitly)
   from the host ILU factorization, upload it to the GPU as CSR, and run the
   CUSPARSE triangular-solve analysis.  Mirrors the lower-triangular builder. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                 = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
      */
      /* adiag[] runs backwards here: adiag[0]-adiag[n] gives the upper-factor size
         in the SeqAIJ factored-matrix layout. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         because the factored storage keeps U rows in reverse order. */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the host factor stores the inverse of the diagonal; v[nz] is that slot */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper, NON-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Driver for the ILU GPU setup: build both triangular factors on the GPU,
   allocate the device work vector used by the solves, and cache the row/column
   permutations (only when they are not the identity, since identity orderings
   use the cheaper NaturalOrdering solve paths). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector for the two-stage triangular solve */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Build both ICC triangular factors on the GPU from the host Cholesky/ICC
   factorization.  Only the upper factor is stored on the host; the "lower"
   solve reuses the same sparsity pattern with the transpose operation. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                 = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ here although it was
     cast to Mat_SeqAIJ above — presumably the ICC factor is stored in SBAIJ
     layout; verify against the factorization code. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix; AAUp/AALo share the sparsity pattern
         but carry differently scaled values for the U and (transposed) L solves. */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] holds the diagonal slot for row i; store its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; L additionally divides by the diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper triangular, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assign() copies the pinned host arrays to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.  Note: the "lower" factor reuses the
         UPPER fill mode on purpose — it is solved with the TRANSPOSE operation
         below, so the upper pattern acts as L^T. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Driver for the ICC GPU setup: build both triangular factors, allocate the
   device work vector, record the true nonzero count of L+L^T, and cache the
   (single, symmetric) permutation when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* both factors share the pattern, so count off-diagonals twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same permutation serves rows and columns */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU: factor on the host, pick the solve variant based on whether the
   orderings are the identity, then upload/analyze the factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* identity orderings: skip the permutation kernels entirely */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC: factor on the host, pick the solve variant based on
   whether the (symmetric) ordering is the identity, then upload/analyze the
   factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Build transposed copies (CSC via csr2csc) of both triangular factors and run
   the CUSPARSE solve analysis on them, for use by MatSolveTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips because the transpose of lower is upper */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 757 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 758 diagType = cusparseGetMatDiagType(loTriFactor->descr); 759 760 /* Create the matrix description */ 761 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 762 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 763 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 764 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 765 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 766 767 /* Create the solve analysis information */ 768 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 769 770 /* set the operation */ 771 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 772 773 /* allocate GPU space for the CSC of the lower triangular factor*/ 774 loTriFactorT->csrMat = new CsrMatrix; 775 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 776 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 777 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 778 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 779 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 780 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 781 782 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 783 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 784 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 785 loTriFactor->csrMat->values->data().get(), 786 loTriFactor->csrMat->row_offsets->data().get(), 787 loTriFactor->csrMat->column_indices->data().get(), 788 loTriFactorT->csrMat->values->data().get(), 789 loTriFactorT->csrMat->column_indices->data().get(), 790 loTriFactorT->csrMat->row_offsets->data().get(), 791 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 792 793 /* perform the solve analysis on the transposed matrix */ 794 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 795 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 796 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->solveInfo);CHKERRCUSP(stat); 799 800 /* assign the pointer. Is this really necessary? */ 801 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 802 803 /*********************************************/ 804 /* Now the Transpose of the Upper Tri Factor */ 805 /*********************************************/ 806 807 /* allocate space for the transpose of the upper triangular factor */ 808 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 809 810 /* set the matrix descriptors of the upper triangular factor */ 811 matrixType = cusparseGetMatType(upTriFactor->descr); 812 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 813 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
814 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 815 diagType = cusparseGetMatDiagType(upTriFactor->descr); 816 817 /* Create the matrix description */ 818 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 819 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 820 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 821 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 822 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 823 824 /* Create the solve analysis information */ 825 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 826 827 /* set the operation */ 828 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 829 830 /* allocate GPU space for the CSC of the upper triangular factor*/ 831 upTriFactorT->csrMat = new CsrMatrix; 832 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 833 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 834 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 835 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 836 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 837 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 838 839 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 840 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 841 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 842 upTriFactor->csrMat->values->data().get(), 843 upTriFactor->csrMat->row_offsets->data().get(), 844 upTriFactor->csrMat->column_indices->data().get(), 845 upTriFactorT->csrMat->values->data().get(), 846 upTriFactorT->csrMat->column_indices->data().get(), 847 upTriFactorT->csrMat->row_offsets->data().get(), 848 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 849 850 /* perform the solve analysis on the transposed matrix */ 851 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 852 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 853 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->solveInfo);CHKERRCUSP(stat); 856 857 /* assign the pointer. Is this really necessary? 
*/ 858 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 859 PetscFunctionReturn(0); 860 } 861 862 #undef __FUNCT__ 863 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 864 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 865 { 866 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 867 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 868 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 869 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 870 cusparseStatus_t stat; 871 cusparseIndexBase_t indexBase; 872 cudaError_t err; 873 874 PetscFunctionBegin; 875 876 /* allocate space for the triangular factor information */ 877 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 878 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 879 indexBase = cusparseGetMatIndexBase(matstruct->descr); 880 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 881 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 882 883 /* set alpha and beta */ 884 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 885 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 886 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 887 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 888 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 889 890 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 891 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 892 CsrMatrix *matrixT= new CsrMatrix; 893 matrixT->num_rows = A->rmap->n; 894 matrixT->num_cols = A->cmap->n; 895 matrixT->num_entries = a->nz; 896 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose of the upper triangular factor, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* HYB has no direct transpose routine: go HYB -> CSR -> CSC -> HYB */
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices.
     NOTE(review): this array is created empty (no size), unlike the sized copy
     made in MatSeqAIJCUSPARSECopyToGPU -- presumably the transpose path never
     uses compressed rows; verify against MatMultTransposeAdd. */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solve A^T x = b on the GPU using the cached transposed triangular factors
   (built on demand), with row/column permutations applied on the device.
   Order of operations: permute b, solve U^T, solve L^T, permute the result. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for the identity permutation:
   the device-side reorder/permute steps are skipped and b is fed directly to
   the U^T solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solve A x = b on the GPU: permute b by the row permutation, forward-solve L,
   back-solve U, then apply the column permutation to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A x = b for the identity permutation: forward-solve L directly from b,
   back-solve U into x, no device-side permutation copies. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the host AIJ matrix to the GPU when the device copy is absent or stale.
   The matrix is always stored in compressed-row form on the GPU: either the
   host's own compressedrow data is reused, or a compressed index set is built
   here from the rows with nonzeros. Depending on cusparsestruct->format the
   device matrix ends up as CSR or as a cuSPARSE ELL/HYB structure. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that actually contain nonzeros */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets (ii) and the
           original row indices (ridx) for the nonzero rows only */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha/beta scalars for SpMV (pointer mode DEVICE) */
      err  = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err  = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage the CSR data on the device, convert to HYB, then discard the CSR copy */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
        /* NOTE(review): there is no #else here -- with CUDA < 4020 and ELL/HYB
           format requested, matstruct->mat is left unset; verify this
           configuration is rejected elsewhere (e.g. at option-setting time). */
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated here only in the forced-compression branch */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the number of compressed rows; used by MatMultAdd */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/* Create CUSP-type work vectors compatible with this matrix: `right` matches
   the column layout (input of MatMult), `left` matches the row layout (output). */
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: given a tuple (src, dst), performs dst += src.
   Used to scatter-add the compressed-row SpMV result into the full vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x on the GPU via cuSPARSE SpMV (CSR or ELL/HYB storage). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block the host if no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T*x on the GPU: runs a NON_TRANSPOSE SpMV against the explicitly
   stored transpose, which is generated on demand and cached in matTranspose. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x on the GPU: copies y into z, runs SpMV into the compressed-row
   work vector, then scatter-adds that result into z via cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired and restored but never otherwise used;
       presumably only the get/restore bookkeeping is wanted -- verify. */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T*x on the GPU using the cached explicit transpose (built on demand). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation;
       both zip iterators start at workVector->begin(): the end iterator is just the
       begin iterator advanced by workVector->size() */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish assembly on the host, mirror the data to the GPU, and install the GPU mat-vec kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat
A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures, not on the GPU mat */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* flush assemblies leave the operation table untouched */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast drops const to match the preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Frees all GPU-side resources (cusparse descriptors/handles, thrust arrays,
   cudaMalloc'ed scalars) held in A->spptr before delegating to the host-side
   MatDestroy_SeqAIJ().  Non-factored matrices hold a Mat_SeqAIJCUSPARSE;
   factored matrices hold a Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
              if (mat->values) delete (THRUSTARRAY*)mat->values;
              if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
              if
(mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          /* alpha/beta are device scalars allocated with cudaMalloc */
          if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
          if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
              if (matT->values) delete (THRUSTARRAY*)matT->values;
              if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
              if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
          if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
          if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* nothing was ever copied to the GPU: only the handle and the container exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
          if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
          if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
          if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactor->csrMat;
        }
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
          if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
          if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
          if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactor->csrMat;
        }
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
          if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
          if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
          if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactorT->csrMat;
        }
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
          if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
          if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
          if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactorT->csrMat;
        }
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds a SeqAIJ matrix, then attaches the GPU (cusparse) data structures. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
1825 now build a GPU matrix data structure */ 1826 B->spptr = new Mat_SeqAIJCUSPARSE; 1827 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1828 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1829 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1830 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1831 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1832 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1833 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1834 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1835 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1836 } else { 1837 /* NEXT, set the pointers to the triangular factors */ 1838 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1839 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1840 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1841 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1842 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1843 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1844 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1845 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1846 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1847 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1848 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1849 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1850 } 1851 1852 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1853 default cusparse tri solve. 
Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  /* install the GPU implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet; MatAssemblyEnd triggers the first copy */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*M
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1885 1886 Level: beginner 1887 1888 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1889 M*/ 1890