1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc-private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = 
(Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 
89 90 Level: beginner 91 92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 93 M*/ 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 128 PetscFunctionBegin; 129 #if CUDA_VERSION>=4020 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 
cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 #else 141 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 142 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 143 #endif 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 149 operation. Only the MatMult operation can use different GPU storage formats 150 for MPIAIJCUSPARSE matrices. 151 Not Collective 152 153 Input Parameters: 154 + A - Matrix of type SEQAIJCUSPARSE 155 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 156 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 157 158 Output Parameter: 159 160 Level: intermediate 161 162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 163 @*/ 164 #undef __FUNCT__ 165 #define __FUNCT__ "MatCUSPARSESetFormat" 166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 167 { 168 PetscErrorCode ierr; 169 170 PetscFunctionBegin; 171 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 172 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 MatCUSPARSEStorageFormat format; 182 PetscBool flg; 183 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 184 185 PetscFunctionBegin; 186 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 187 ierr = PetscObjectOptionsBegin((PetscObject)A); 188 if (A->factortype==MAT_FACTOR_NONE) { 189 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 190 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 191 if (flg) { 192 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 193 } 194 } 195 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 196 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 197 if (flg) { 198 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 199 } 200 ierr = PetscOptionsEnd();CHKERRQ(ierr); 201 PetscFunctionReturn(0); 202 203 } 204 205 #undef __FUNCT__ 206 #define __FUNCT__ 
"MatILUFactorSymbolic_SeqAIJCUSPARSE" 207 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 208 { 209 PetscErrorCode ierr; 210 211 PetscFunctionBegin; 212 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 213 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 214 PetscFunctionReturn(0); 215 } 216 217 #undef __FUNCT__ 218 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 219 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 220 { 221 PetscErrorCode ierr; 222 223 PetscFunctionBegin; 224 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 225 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 226 PetscFunctionReturn(0); 227 } 228 229 #undef __FUNCT__ 230 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 231 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 232 { 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 237 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 238 PetscFunctionReturn(0); 239 } 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 243 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 244 { 245 PetscErrorCode ierr; 246 247 PetscFunctionBegin; 248 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 249 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 250 PetscFunctionReturn(0); 251 } 252 253 #undef __FUNCT__ 254 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 255 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 256 { 257 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 258 PetscInt n = A->rmap->n; 259 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors 
= (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 260 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 261 cusparseStatus_t stat; 262 const PetscInt *ai = a->i,*aj = a->j,*vi; 263 const MatScalar *aa = a->a,*v; 264 PetscInt *AiLo, *AjLo; 265 PetscScalar *AALo; 266 PetscInt i,nz, nzLower, offset, rowOffset; 267 PetscErrorCode ierr; 268 269 PetscFunctionBegin; 270 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 271 try { 272 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 273 nzLower=n+ai[n]-ai[1]; 274 275 /* Allocate Space for the lower triangular matrix */ 276 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 277 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 278 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 279 280 /* Fill the lower triangular matrix */ 281 AiLo[0] = (PetscInt) 0; 282 AiLo[n] = nzLower; 283 AjLo[0] = (PetscInt) 0; 284 AALo[0] = (MatScalar) 1.0; 285 v = aa; 286 vi = aj; 287 offset = 1; 288 rowOffset= 1; 289 for (i=1; i<n; i++) { 290 nz = ai[i+1] - ai[i]; 291 /* additional 1 for the term on the diagonal */ 292 AiLo[i] = rowOffset; 293 rowOffset += nz+1; 294 295 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 296 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 297 298 offset += nz; 299 AjLo[offset] = (PetscInt) i; 300 AALo[offset] = (MatScalar) 1.0; 301 offset += 1; 302 303 v += nz; 304 vi += nz; 305 } 306 307 /* allocate space for the triangular factor information */ 308 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 309 310 /* Create the matrix description */ 311 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 312 stat = cusparseSetMatIndexBase(loTriFactor->descr, 
CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 313 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 314 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 315 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 316 317 /* Create the solve analysis information */ 318 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 319 320 /* set the operation */ 321 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 322 323 /* set the matrix */ 324 loTriFactor->csrMat = new CsrMatrix; 325 loTriFactor->csrMat->num_rows = n; 326 loTriFactor->csrMat->num_cols = n; 327 loTriFactor->csrMat->num_entries = nzLower; 328 329 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 330 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 331 332 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 333 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 334 335 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 336 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 337 338 /* perform the solve analysis */ 339 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 340 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 341 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 342 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 343 344 /* assign the pointer. Is this really necessary? 
*/ 345 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 346 347 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 348 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 349 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 350 } catch(char *ex) { 351 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 352 } 353 } 354 PetscFunctionReturn(0); 355 } 356 357 #undef __FUNCT__ 358 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 359 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 360 { 361 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 362 PetscInt n = A->rmap->n; 363 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 364 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 365 cusparseStatus_t stat; 366 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 367 const MatScalar *aa = a->a,*v; 368 PetscInt *AiUp, *AjUp; 369 PetscScalar *AAUp; 370 PetscInt i,nz, nzUpper, offset; 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 375 try { 376 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 377 nzUpper = adiag[0]-adiag[n]; 378 379 /* Allocate Space for the upper triangular matrix */ 380 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 381 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 382 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 383 384 /* Fill the upper triangular matrix */ 385 AiUp[0]=(PetscInt) 0; 386 AiUp[n]=nzUpper; 387 offset = nzUpper; 388 for (i=n-1; i>=0; i--) { 389 v = aa + adiag[i+1] + 1; 390 vi = aj + adiag[i+1] + 1; 391 392 /* number of elements NOT on the diagonal */ 393 nz = adiag[i] - adiag[i+1]-1; 394 395 /* decrement the offset */ 396 offset -= (nz+1); 397 398 /* first, set the diagonal elements */ 399 AjUp[offset] = (PetscInt) i; 400 AAUp[offset] = 1./v[nz]; 401 AiUp[i] = AiUp[i+1] - (nz+1); 402 403 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 404 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 405 } 406 407 /* allocate space for the triangular factor information */ 408 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 409 410 /* Create the matrix description */ 411 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 412 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 413 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 414 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 415 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 416 417 /* Create the solve analysis information */ 418 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 419 420 /* set the operation */ 421 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 422 423 /* set the matrix */ 424 upTriFactor->csrMat = new CsrMatrix; 425 upTriFactor->csrMat->num_rows = 
n; 426 upTriFactor->csrMat->num_cols = n; 427 upTriFactor->csrMat->num_entries = nzUpper; 428 429 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 430 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 431 432 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 433 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 434 435 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 436 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 437 438 /* perform the solve analysis */ 439 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 440 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 441 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 442 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 443 444 /* assign the pointer. Is this really necessary? */ 445 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 446 447 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 448 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 449 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 450 } catch(char *ex) { 451 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 452 } 453 } 454 PetscFunctionReturn(0); 455 } 456 457 #undef __FUNCT__ 458 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 459 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 460 { 461 PetscErrorCode ierr; 462 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 463 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 464 IS isrow = a->row,iscol = a->icol; 465 PetscBool row_identity,col_identity; 466 const PetscInt *r,*c; 467 PetscInt n = A->rmap->n; 468 469 PetscFunctionBegin; 470 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 471 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 472 473 cusparseTriFactors->workVector = new 
THRUSTARRAY;
  /* scratch device vector used by the permuted triangular solves */
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only store the permutation when it actually permutes something */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds BOTH triangular factors for an
   ICC/Cholesky factorization from a single upper-triangular pass: U is stored
   directly (unit diagonal descriptor, inverted diagonal in AALo/AAUp), and the
   "lower" factor reuses the SAME upper CSR pattern but is solved with
   CUSPARSE_OPERATION_TRANSPOSE, i.e. L = U^T is never formed explicitly.

   NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ below —
   this relies on the i/j/a members living at compatible offsets in the two
   structs; confirm against the factor type actually stored in A->data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal; both factors record its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; L additionally scales by the diagonal reciprocal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor keeps the UPPER fill mode because the same CSR
         pattern is solved with the TRANSPOSE operation below (L = U^T) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC factors on the GPU,
   allocates the solve work vector, and caches the (symmetric) permutation when
   it is not the identity; the same index set serves as both row and column
   permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* total nonzeros in L+U: off-diagonals appear in both factors, the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU/ILU: performs the factorization
   on the CPU, selects the natural-ordering fast path when both permutations are
   the identity, then uploads the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: skip the permutation vectors entirely */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky/ICC: performs the
   factorization on the CPU, selects the natural-ordering fast path when the
   (symmetric) permutation is the identity, then uploads the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (CSC
   via csr2csc) of both triangular factors and runs the solve analysis on them,
   for use by MatSolveTranspose.  (Continues beyond this chunk.)

   NOTE(review): loTriFactorT/upTriFactorT are read from A->spptr but are
   immediately overwritten with freshly allocated structs below — if transpose
   factors already exist this presumably leaks them; confirm callers only
   invoke this once per factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  /* the transpose flips the fill mode */
  fillMode =
               cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
                 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the stored matrix is already the transpose, so the solve
     itself is a non-transpose solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor (square, so
     num_rows == num_cols and the sizes below are correct) */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  /* again, fill mode flips under transposition */
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
                 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary?
*/ 859 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 860 PetscFunctionReturn(0); 861 } 862 863 #undef __FUNCT__ 864 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 865 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 866 { 867 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 868 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 869 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 870 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 871 cusparseStatus_t stat; 872 cusparseIndexBase_t indexBase; 873 cudaError_t err; 874 875 PetscFunctionBegin; 876 877 /* allocate space for the triangular factor information */ 878 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 879 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 880 indexBase = cusparseGetMatIndexBase(matstruct->descr); 881 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 882 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 883 884 /* set alpha and beta */ 885 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 886 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 887 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 888 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 889 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 890 891 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 892 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 893 CsrMatrix *matrixT= new CsrMatrix; 894 matrixT->num_rows = A->rmap->n; 895 matrixT->num_cols = A->cmap->n; 896 matrixT->num_entries = a->nz; 897 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
898 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 899 matrixT->values = new THRUSTARRAY(a->nz); 900 901 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 902 indexBase = cusparseGetMatIndexBase(matstruct->descr); 903 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 904 matrix->num_cols, matrix->num_entries, 905 matrix->values->data().get(), 906 matrix->row_offsets->data().get(), 907 matrix->column_indices->data().get(), 908 matrixT->values->data().get(), 909 matrixT->column_indices->data().get(), 910 matrixT->row_offsets->data().get(), 911 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 912 913 /* assign the pointer */ 914 matstructT->mat = matrixT; 915 916 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 917 #if CUDA_VERSION>=5000 918 /* First convert HYB to CSR */ 919 CsrMatrix *temp= new CsrMatrix; 920 temp->num_rows = A->rmap->n; 921 temp->num_cols = A->cmap->n; 922 temp->num_entries = a->nz; 923 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 924 temp->column_indices = new THRUSTINTARRAY32(a->nz); 925 temp->values = new THRUSTARRAY(a->nz); 926 927 928 stat = cusparse_hyb2csr(cusparsestruct->handle, 929 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 930 temp->values->data().get(), 931 temp->row_offsets->data().get(), 932 temp->column_indices->data().get());CHKERRCUSP(stat); 933 934 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 935 CsrMatrix *tempT= new CsrMatrix; 936 tempT->num_rows = A->rmap->n; 937 tempT->num_cols = A->cmap->n; 938 tempT->num_entries = a->nz; 939 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 940 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 941 tempT->values = new THRUSTARRAY(a->nz); 942 943 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 944 temp->num_cols, temp->num_entries, 945 temp->values->data().get(), 946 temp->row_offsets->data().get(), 947 temp->column_indices->data().get(), 948 tempT->values->data().get(), 949 tempT->column_indices->data().get(), 950 tempT->row_offsets->data().get(), 951 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 952 953 /* Last, convert CSC to HYB */ 954 cusparseHybMat_t hybMat; 955 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 956 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 957 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 958 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 959 matstructT->descr, tempT->values->data().get(), 960 tempT->row_offsets->data().get(), 961 tempT->column_indices->data().get(), 962 hybMat, 0, partition);CHKERRCUSP(stat); 963 964 /* assign the pointer */ 965 matstructT->mat = hybMat; 966 967 /* delete temporaries */ 968 if (tempT) { 969 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 970 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 971 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 972 delete (CsrMatrix*) tempT; 973 } 974 if (temp) { 975 if (temp->values) delete (THRUSTARRAY*) temp->values; 976 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 977 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 978 delete (CsrMatrix*) temp; 979 } 980 #else 981 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in 
MatMultTranspose) require CUDA 5.0 or later."); 982 #endif 983 } 984 /* assign the compressed row indices */ 985 matstructT->cprowIndices = new THRUSTINTARRAY; 986 987 /* assign the pointer */ 988 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 989 PetscFunctionReturn(0); 990 } 991 992 #undef __FUNCT__ 993 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 994 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 995 { 996 CUSPARRAY *xGPU, *bGPU; 997 cusparseStatus_t stat; 998 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 999 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1000 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1001 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1002 PetscErrorCode ierr; 1003 1004 PetscFunctionBegin; 1005 /* Analyze the matrix and create the transpose ... 
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A^T x = b on the GPU when the factorization permutations are the identity:
   solves U^T then L^T (stored as transposed factors) with no reordering steps. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Triangular solve A x = b on the GPU with row/column permutations:
   permute b by the row permutation, solve L then U, then apply the column
   permutation (via the work vector, since it cannot be done in place). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Triangular solve A x = b on the GPU for identity permutations: solve L then U,
   no reordering needed. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat =
         cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/* Copy the AIJ (CSR) matrix to the GPU in the configured storage format (CSR, ELL
   or HYB), forcing a compressed-row representation if the CPU matrix does not
   already use one.  Only acts when the GPU copy is absent or out of date. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count rows with at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build ii (row offsets of nonempty
           rows) and ridx (original row index of each nonempty row) */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k] = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha/beta for POINTER_MODE_DEVICE spmv calls */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage the CSR data on the GPU, convert to HYB, then discard the staging copy */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): with CUDA_VERSION < 4020 this branch silently leaves
           matstruct->mat unset — presumably rejected earlier; verify in
           MatSetFromOptions_SeqAIJCUSPARSE */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated here only when compression was forced */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/* Create CUSP-type vectors compatible with mat: *right matches the column layout,
   *left matches the row layout. */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: adds the first tuple element into the second (y += x); used to
   scatter compressed-row spmv results back into the full vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* yy = A*xx via a cusparse spmv in the configured storage format. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    /* no user stream set: synchronize before returning */
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* yy = A'*xx, using an explicitly stored transpose of A built lazily on first use. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr =
           MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  /* the transpose is stored explicitly, so a NON_TRANSPOSE spmv on it computes A'*x */
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* zz = yy + A*xx: spmv into the compressed work vector, then scatter-add into zz
   at the nonempty-row positions recorded in cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
VecCUSPPlusEquals()); 1512 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1513 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1514 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1515 1516 } catch(char *ex) { 1517 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1518 } 1519 ierr = WaitForGPU();CHKERRCUSP(ierr); 1520 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1521 PetscFunctionReturn(0); 1522 } 1523 1524 #undef __FUNCT__ 1525 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1526 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1527 { 1528 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1529 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1530 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1531 CUSPARRAY *xarray,*yarray,*zarray; 1532 PetscErrorCode ierr; 1533 cusparseStatus_t stat; 1534 1535 PetscFunctionBegin; 1536 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1537 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1538 if (!matstructT) { 1539 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1540 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1541 } 1542 1543 try { 1544 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1545 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1546 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1547 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1548 1549 /* multiply add with matrix transpose */ 1550 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1551 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1552 /* here we need to be careful to set the number of rows in the multiply to the 1553 number of compressed rows in the matrix ... 
which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when the compressed-row work vector is empty */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation;
       workVector[i] is added to zarray[cprowIndices[i]] via the permutation iterator.
       NOTE(review): this uses matstructT->cprowIndices with a workVector sized for
       the non-transposed compressed rows — confirm both are sized consistently in
       the transpose-generation path */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    /* thrust/CUSPARSE wrappers in this file signal failure via thrown C strings */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat
A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* run the host-side AIJ assembly first, then mirror the data to the GPU */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* the GPU mult kernels are only installed on a final assembly */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; the cast drops const for the legacy interface */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - releases all GPU-side storage (mult structures or
   triangular factors, depending on factortype) before delegating to the host
   SeqAIJ destroy.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      /* nothing to free on the device if the GPU copy was never allocated */
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
              if (mat->values) delete (THRUSTARRAY*)mat->values;
              if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
              if
(mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1700 delete (CsrMatrix*)matstruct->mat; 1701 } 1702 } 1703 if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1704 if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1705 if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1706 if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1707 } 1708 if (matstructT) { 1709 if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1710 if (matstructT->mat) { 1711 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1712 #if CUDA_VERSION>=4020 1713 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1714 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1715 #endif 1716 } else { 1717 CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1718 if (matT->values) delete (THRUSTARRAY*)matT->values; 1719 if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1720 if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1721 delete (CsrMatrix*)matstructT->mat; 1722 } 1723 } 1724 if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1725 if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1726 if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1727 if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1728 } 1729 if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1730 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1731 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1732 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1733 } else { 1734 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1735 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 
1736 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1737 } 1738 } catch(char *ex) { 1739 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1740 } 1741 } else { 1742 /* The triangular factors */ 1743 try { 1744 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1745 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1746 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1747 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1748 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1749 1750 /* delete all the members in each of the data structures */ 1751 if (loTriFactor) { 1752 if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1753 if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1754 if (loTriFactor->csrMat) { 1755 if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1756 if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1757 if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1758 delete (CsrMatrix*)loTriFactor->csrMat; 1759 } 1760 if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1761 } 1762 1763 if (upTriFactor) { 1764 if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1765 if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1766 if (upTriFactor->csrMat) { 1767 if (upTriFactor->csrMat->values) delete 
(THRUSTARRAY*)upTriFactor->csrMat->values; 1768 if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1769 if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1770 delete (CsrMatrix*)upTriFactor->csrMat; 1771 } 1772 if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1773 } 1774 1775 if (loTriFactorT) { 1776 if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1777 if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1778 if (loTriFactorT->csrMat) { 1779 if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1780 if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1781 if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1782 delete (CsrMatrix*)loTriFactorT->csrMat; 1783 } 1784 if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1785 } 1786 1787 if (upTriFactorT) { 1788 if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1789 if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1790 if (upTriFactorT->csrMat) { 1791 if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1792 if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1793 if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1794 delete (CsrMatrix*)upTriFactorT->csrMat; 1795 } 1796 if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1797 } 1798 if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1799 if 
(cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1800 if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1801 if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1802 if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 1803 } catch(char *ex) { 1804 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1805 } 1806 } 1807 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1808 A->spptr = 0; 1809 1810 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1811 PetscFunctionReturn(0); 1812 } 1813 1814 #undef __FUNCT__ 1815 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1816 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1817 { 1818 PetscErrorCode ierr; 1819 cusparseStatus_t stat; 1820 cusparseHandle_t handle=0; 1821 1822 PetscFunctionBegin; 1823 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1824 if (B->factortype==MAT_FACTOR_NONE) { 1825 /* you cannot check the inode.use flag here since the matrix was just created. 
1826 now build a GPU matrix data structure */ 1827 B->spptr = new Mat_SeqAIJCUSPARSE; 1828 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1829 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1830 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1831 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1832 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1833 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1834 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1835 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1836 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1837 } else { 1838 /* NEXT, set the pointers to the triangular factors */ 1839 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1840 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1841 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1842 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1843 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1844 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1845 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1846 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1847 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1848 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1849 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1850 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1851 } 1852 1853 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1854 default cusparse tri solve. 
Note the difference with the implementation in 1855 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1856 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1857 1858 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1859 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1860 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1861 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1862 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1863 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1864 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1865 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1866 1867 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1868 1869 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1870 1871 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1872 PetscFunctionReturn(0); 1873 } 1874 1875 /*M 1876 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1877 1878 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1879 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1880 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1881 1882 Options Database Keys: 1883 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1884 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1885 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1886 1887 Level: beginner 1888 1889 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1890 M*/ 1891