1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include "petscconf.h" 7 #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include "../src/vec/vec/impls/dvecimpl.h" 10 #include "petsc-private/vecimpl.h" 11 #undef VecType 12 #include "cusparsematimpl.h" 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 #undef __FUNCT__ 35 #define __FUNCT__ "MatCUSPARSESetStream" 36 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 37 { 38 cusparseStatus_t stat; 39 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 40 41 PetscFunctionBegin; 42 cusparsestruct->stream = stream; 43 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 44 PetscFunctionReturn(0); 45 } 46 47 #undef __FUNCT__ 48 #define __FUNCT__ "MatCUSPARSESetHandle" 49 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 50 { 51 cusparseStatus_t stat; 52 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 53 54 PetscFunctionBegin; 55 if (cusparsestruct->handle) 56 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 57 cusparsestruct->handle = handle; 58 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 59 PetscFunctionReturn(0); 60 } 61 62 #undef __FUNCT__ 63 #define __FUNCT__ "MatCUSPARSEClearHandle" 64 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 65 { 66 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 67 PetscFunctionBegin; 68 if (cusparsestruct->handle) 69 cusparsestruct->handle = 0; 70 PetscFunctionReturn(0); 71 } 72 73 #undef __FUNCT__ 74 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 75 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 76 { 77 PetscFunctionBegin; 78 *type = MATSOLVERCUSPARSE; 79 PetscFunctionReturn(0); 80 } 81 82 /*MC 83 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 84 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 85 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 86 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 87 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 88 algorithms are not recommended. This class does NOT support direct solver operations. 89 90 Level: beginner 91 92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 93 M*/ 94 95 #undef __FUNCT__ 96 #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 #undef __FUNCT__ 123 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 124 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 125 { 126 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 127 128 PetscFunctionBegin; 129 #if CUDA_VERSION>=4020 130 switch (op) { 131 case MAT_CUSPARSE_MULT: 132 cusparsestruct->format = format; 133 break; 134 case MAT_CUSPARSE_ALL: 135 cusparsestruct->format = format; 136 break; 137 default: 138 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 139 } 140 #else 141 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 142 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 143 #endif 144 PetscFunctionReturn(0); 145 } 146 147 /*@ 148 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 149 operation. Only the MatMult operation can use different GPU storage formats 150 for MPIAIJCUSPARSE matrices. 151 Not Collective 152 153 Input Parameters: 154 + A - Matrix of type SEQAIJCUSPARSE 155 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 156 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 157 158 Output Parameter: 159 160 Level: intermediate 161 162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 163 @*/ 164 #undef __FUNCT__ 165 #define __FUNCT__ "MatCUSPARSESetFormat" 166 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 167 { 168 PetscErrorCode ierr; 169 170 PetscFunctionBegin; 171 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 172 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 178 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 179 { 180 PetscErrorCode ierr; 181 MatCUSPARSEStorageFormat format; 182 PetscBool flg; 183 184 PetscFunctionBegin; 185 ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 186 ierr = PetscObjectOptionsBegin((PetscObject)A); 187 if (A->factortype==MAT_FACTOR_NONE) { 188 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 189 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190 if (flg) { 191 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 192 } 193 } 194 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 195 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196 if (flg) { 197 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 198 } 199 ierr = PetscOptionsEnd();CHKERRQ(ierr); 200 PetscFunctionReturn(0); 201 202 } 203 204 #undef __FUNCT__ 205 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 206 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 207 { 208 PetscErrorCode ierr; 209 210 PetscFunctionBegin; 211 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 212 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 213 PetscFunctionReturn(0); 214 } 215 216 #undef __FUNCT__ 217 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 218 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 219 { 220 PetscErrorCode ierr; 221 222 PetscFunctionBegin; 223 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 224 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 230 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 231 { 232 PetscErrorCode ierr; 233 234 PetscFunctionBegin; 235 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 236 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 237 PetscFunctionReturn(0); 238 } 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 242 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 243 { 244 PetscErrorCode ierr; 245 246 PetscFunctionBegin; 247 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 248 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 254 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 255 { 256 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 257 PetscInt n = A->rmap->n; 258 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 259 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 260 cusparseStatus_t stat; 261 const PetscInt *ai = a->i,*aj = a->j,*vi; 262 const MatScalar *aa = a->a,*v; 263 PetscInt *AiLo, *AjLo; 264 PetscScalar *AALo; 265 PetscInt i,nz, nzLower, offset, rowOffset; 266 PetscErrorCode ierr; 267 268 PetscFunctionBegin; 269 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 270 try { 271 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 272 nzLower=n+ai[n]-ai[1]; 273 274 /* Allocate Space for the lower triangular matrix */ 275 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 276 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 277 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 278 279 /* Fill the lower triangular matrix */ 280 AiLo[0] = (PetscInt) 0; 281 AiLo[n] = nzLower; 282 AjLo[0] = (PetscInt) 0; 283 AALo[0] = (MatScalar) 1.0; 284 v = aa; 285 vi = aj; 286 offset = 1; 287 rowOffset= 1; 288 for (i=1; i<n; i++) { 289 nz = ai[i+1] - ai[i]; 290 /* additional 1 for the term on the diagonal */ 291 AiLo[i] = rowOffset; 292 rowOffset += nz+1; 293 294 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 295 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 296 297 offset += nz; 298 AjLo[offset] = (PetscInt) i; 299 AALo[offset] = (MatScalar) 1.0; 300 offset += 1; 301 302 v += nz; 303 vi += nz; 304 } 305 306 /* allocate space for the triangular factor information */ 307 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 308 309 /* Create the matrix description */ 310 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 311 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 312 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 313 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 314 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 315 316 /* Create the solve analysis information */ 317 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 318 319 /* set the operation */ 320 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 321 322 /* set the matrix */ 323 loTriFactor->csrMat = new CsrMatrix; 324 loTriFactor->csrMat->num_rows = n; 325 loTriFactor->csrMat->num_cols = n; 326 loTriFactor->csrMat->num_entries = nzLower; 327 328 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 329 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 330 331 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 332 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 333 334 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 335 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 336 337 /* perform the solve analysis */ 338 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 339 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 340 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 341 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 342 343 /* assign the pointer. Is this really necessary? */ 344 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 345 346 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 347 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 348 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 349 } catch(char *ex) { 350 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 351 } 352 } 353 PetscFunctionReturn(0); 354 } 355 356 #undef __FUNCT__ 357 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 358 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 359 { 360 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 361 PetscInt n = A->rmap->n; 362 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 363 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 364 cusparseStatus_t stat; 365 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 366 const MatScalar *aa = a->a,*v; 367 PetscInt *AiUp, *AjUp; 368 PetscScalar *AAUp; 369 PetscInt i,nz, nzUpper, offset; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 374 try { 375 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 376 nzUpper = adiag[0]-adiag[n]; 377 378 /* Allocate Space for the upper triangular matrix */ 379 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 380 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 381 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 382 383 /* Fill the upper triangular matrix */ 384 AiUp[0]=(PetscInt) 0; 385 AiUp[n]=nzUpper; 386 offset = nzUpper; 387 for (i=n-1; i>=0; i--) { 388 v = aa + adiag[i+1] + 1; 389 vi = aj + adiag[i+1] + 1; 390 391 /* number of elements NOT on the diagonal */ 392 nz = adiag[i] - adiag[i+1]-1; 393 394 /* decrement the offset */ 395 offset -= (nz+1); 396 397 /* first, set the diagonal elements */ 398 AjUp[offset] = (PetscInt) i; 399 AAUp[offset] = 1./v[nz]; 400 AiUp[i] = AiUp[i+1] - (nz+1); 401 402 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 403 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 404 } 405 406 /* allocate space for the triangular factor information */ 407 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 408 409 /* Create the matrix description */ 410 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 411 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 412 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 413 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 414 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 415 416 /* Create the solve analysis information */ 417 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 418 419 /* set the operation */ 420 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 421 422 /* set the matrix */ 423 upTriFactor->csrMat = new CsrMatrix; 424 upTriFactor->csrMat->num_rows = n; 425 upTriFactor->csrMat->num_cols = n; 426 upTriFactor->csrMat->num_entries = nzUpper; 427 428 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 429 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 430 431 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 432 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 433 434 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 435 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 436 437 /* perform the solve analysis */ 438 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 439 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 440 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 441 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 442 443 /* assign the pointer. Is this really necessary? */ 444 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 445 446 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 447 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 448 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 449 } catch(char *ex) { 450 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 451 } 452 } 453 PetscFunctionReturn(0); 454 } 455 456 #undef __FUNCT__ 457 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 458 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 459 { 460 PetscErrorCode ierr; 461 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 462 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 463 IS isrow = a->row,iscol = a->icol; 464 PetscBool row_identity,col_identity; 465 const PetscInt *r,*c; 466 PetscInt n = A->rmap->n; 467 468 PetscFunctionBegin; 469 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 470 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 471 472 cusparseTriFactors->workVector = new THRUSTARRAY; 473 cusparseTriFactors->workVector->resize(n); 474 cusparseTriFactors->nnz=a->nz; 475 476 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 477 /*lower triangular indices */ 478 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 479 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 480 if (!row_identity) { 481 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 482 cusparseTriFactors->rpermIndices->assign(r, r+n); 483 } 484 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 485 486 /*upper triangular indices */ 487 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 488 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 489 if (!col_identity) { 490 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 491 cusparseTriFactors->cpermIndices->assign(c, c+n); 492 } 493 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 494 PetscFunctionReturn(0); 495 } 496 497 #undef __FUNCT__ 498 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 499 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 500 { 501 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 502 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 503 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 504 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 505 cusparseStatus_t stat; 506 PetscErrorCode ierr; 507 PetscInt *AiUp, *AjUp; 508 PetscScalar *AAUp; 509 PetscScalar *AALo; 510 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 511 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 512 const PetscInt *ai = b->i,*aj = b->j,*vj; 513 const MatScalar *aa = b->a,*v; 514 515 PetscFunctionBegin; 516 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 517 try { 518 /* Allocate Space for the upper triangular matrix */ 519 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 520 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 521 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 522 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 523 524 /* Fill the upper triangular matrix */ 525 AiUp[0]=(PetscInt) 0; 526 AiUp[n]=nzUpper; 527 offset = 0; 528 for (i=0; i<n; i++) { 529 /* set the pointers */ 530 v = aa + ai[i]; 531 vj = aj + ai[i]; 532 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 533 534 /* first, set the diagonal elements */ 535 AjUp[offset] = (PetscInt) i; 536 AAUp[offset] = 1.0/v[nz]; 537 AiUp[i] = offset; 538 AALo[offset] = 1.0/v[nz]; 539 540 offset+=1; 541 if (nz>0) { 542 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 543 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 544 for (j=offset; j<offset+nz; j++) { 545 AAUp[j] = -AAUp[j]; 546 AALo[j] = AAUp[j]/v[nz]; 547 } 548 offset+=nz; 549 } 550 } 551 552 /* allocate space for the triangular factor information */ 553 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 554 555 /* Create the matrix description */ 556 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 557 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 558 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 559 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 560 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 561 562 /* Create the solve analysis information */ 563 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 564 565 /* set the operation */ 566 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 567 568 /* set the matrix */ 569 upTriFactor->csrMat = new CsrMatrix; 570 upTriFactor->csrMat->num_rows = A->rmap->n; 571 upTriFactor->csrMat->num_cols = A->cmap->n; 572 upTriFactor->csrMat->num_entries = a->nz; 573 574 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 575 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 576 577 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 578 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 579 580 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 581 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 582 583 /* perform the solve analysis */ 584 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 585 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 586 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 587 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 588 589 /* assign the pointer. Is this really necessary? */ 590 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 591 592 /* allocate space for the triangular factor information */ 593 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 594 595 /* Create the matrix description */ 596 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 597 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 598 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 599 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 600 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 601 602 /* Create the solve analysis information */ 603 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 604 605 /* set the operation */ 606 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 607 608 /* set the matrix */ 609 loTriFactor->csrMat = new CsrMatrix; 610 loTriFactor->csrMat->num_rows = A->rmap->n; 611 loTriFactor->csrMat->num_cols = A->cmap->n; 612 loTriFactor->csrMat->num_entries = a->nz; 613 614 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 615 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 616 617 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 618 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 619 620 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 621 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 622 623 /* perform the solve analysis */ 624 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 625 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 626 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 627 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 628 629 /* assign the pointer. Is this really necessary? */ 630 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 631 632 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 633 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 634 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 635 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 636 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 637 } catch(char *ex) { 638 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 639 } 640 } 641 PetscFunctionReturn(0); 642 } 643 644 #undef __FUNCT__ 645 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 646 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 647 { 648 PetscErrorCode ierr; 649 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 650 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 651 IS ip = a->row; 652 const PetscInt *rip; 653 PetscBool perm_identity; 654 PetscInt n = A->rmap->n; 655 656 PetscFunctionBegin; 657 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 658 cusparseTriFactors->workVector = new THRUSTARRAY; 659 cusparseTriFactors->workVector->resize(n); 660 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 661 662 /*lower triangular indices */ 663 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 664 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 665 if (!perm_identity) { 666 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 667 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 668 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 669 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 670 } 671 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 672 PetscFunctionReturn(0); 673 } 674 675 #undef __FUNCT__ 676 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 677 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 678 { 679 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 680 IS isrow = b->row,iscol = b->col; 681 PetscBool row_identity,col_identity; 682 PetscErrorCode ierr; 683 684 PetscFunctionBegin; 685 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 686 /* determine which version of MatSolve needs to be used. */ 687 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 688 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 689 if (row_identity && col_identity) { 690 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 691 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 692 } else { 693 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 694 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 695 } 696 697 /* get the triangular factors */ 698 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 704 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 705 { 706 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 707 IS ip = b->row; 708 PetscBool perm_identity; 709 PetscErrorCode ierr; 710 711 PetscFunctionBegin; 712 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 713 714 /* determine which version of MatSolve needs to be used. */ 715 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 716 if (perm_identity) { 717 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 718 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 719 } else { 720 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 721 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 722 } 723 724 /* get the triangular factors */ 725 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 726 PetscFunctionReturn(0); 727 } 728 729 #undef __FUNCT__ 730 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 731 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 732 { 733 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 734 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 735 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 736 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 737 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 738 cusparseStatus_t stat; 739 cusparseIndexBase_t indexBase; 740 cusparseMatrixType_t matrixType; 741 cusparseFillMode_t fillMode; 742 cusparseDiagType_t diagType; 743 744 PetscFunctionBegin; 745 746 /*********************************************/ 747 /* Now the Transpose of the Lower Tri Factor */ 748 /*********************************************/ 749 750 /* allocate space for the transpose of the lower triangular factor */ 751 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 752 753 /* set the matrix descriptors of the lower triangular factor */ 754 matrixType = cusparseGetMatType(loTriFactor->descr); 755 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 756 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 757 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 758 diagType = cusparseGetMatDiagType(loTriFactor->descr); 759 760 /* Create the matrix description */ 761 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 762 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 763 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 764 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 765 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 766 767 /* Create the solve analysis information */ 768 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 769 770 /* set the operation */ 771 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 772 773 /* allocate GPU space for the CSC of the lower triangular factor*/ 774 loTriFactorT->csrMat = new CsrMatrix; 775 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 776 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 777 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 778 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 779 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 780 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 781 782 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 783 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 784 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 785 loTriFactor->csrMat->values->data().get(), 786 loTriFactor->csrMat->row_offsets->data().get(), 787 loTriFactor->csrMat->column_indices->data().get(), 788 loTriFactorT->csrMat->values->data().get(), 789 loTriFactorT->csrMat->column_indices->data().get(), 790 loTriFactorT->csrMat->row_offsets->data().get(), 791 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 792 793 /* perform the solve analysis on the transposed matrix */ 794 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 795 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 796 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->solveInfo);CHKERRCUSP(stat); 799 800 /* assign the pointer. Is this really necessary? */ 801 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 802 803 /*********************************************/ 804 /* Now the Transpose of the Upper Tri Factor */ 805 /*********************************************/ 806 807 /* allocate space for the transpose of the upper triangular factor */ 808 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 809 810 /* set the matrix descriptors of the upper triangular factor */ 811 matrixType = cusparseGetMatType(upTriFactor->descr); 812 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 813 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 814 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 815 diagType = cusparseGetMatDiagType(upTriFactor->descr); 816 817 /* Create the matrix description */ 818 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 819 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 820 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 821 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 822 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 823 824 /* Create the solve analysis information */ 825 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 826 827 /* set the operation */ 828 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 829 830 /* allocate GPU space for the CSC of the upper triangular factor*/ 831 upTriFactorT->csrMat = new CsrMatrix; 832 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 833 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 834 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 835 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 836 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 837 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 838 839 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 840 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 841 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 842 upTriFactor->csrMat->values->data().get(), 843 upTriFactor->csrMat->row_offsets->data().get(), 844 upTriFactor->csrMat->column_indices->data().get(), 845 upTriFactorT->csrMat->values->data().get(), 846 upTriFactorT->csrMat->column_indices->data().get(), 847 upTriFactorT->csrMat->row_offsets->data().get(), 848 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 849 850 /* perform the solve analysis on the transposed matrix */ 851 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 852 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 853 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->solveInfo);CHKERRCUSP(stat); 856 857 /* assign the pointer. Is this really necessary? */ 858 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 859 PetscFunctionReturn(0); 860 } 861 862 #undef __FUNCT__ 863 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 864 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 865 { 866 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 867 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 868 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 869 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 870 cusparseStatus_t stat; 871 cusparseIndexBase_t indexBase; 872 cudaError_t err; 873 874 PetscFunctionBegin; 875 876 /* allocate space for the triangular factor information */ 877 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 878 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 879 indexBase = cusparseGetMatIndexBase(matstruct->descr); 880 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 881 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 882 883 /* set alpha and beta */ 884 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 885 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 886 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 887 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 888 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 889 890 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 891 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 892 CsrMatrix *matrixT= new CsrMatrix; 893 matrixT->num_rows = A->rmap->n; 894 matrixT->num_cols = A->cmap->n; 895 matrixT->num_entries = a->nz; 896 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 897 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 898 matrixT->values = new THRUSTARRAY(a->nz); 899 900 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 901 indexBase = cusparseGetMatIndexBase(matstruct->descr); 902 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 903 matrix->num_cols, matrix->num_entries, 904 matrix->values->data().get(), 905 matrix->row_offsets->data().get(), 906 matrix->column_indices->data().get(), 907 matrixT->values->data().get(), 908 matrixT->column_indices->data().get(), 909 matrixT->row_offsets->data().get(), 910 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 911 912 /* assign the pointer */ 913 matstructT->mat = matrixT; 914 915 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 916 #if CUDA_VERSION>=5000 917 /* First convert HYB to CSR */ 918 CsrMatrix *temp= new CsrMatrix; 919 temp->num_rows = A->rmap->n; 920 temp->num_cols = A->cmap->n; 921 temp->num_entries = a->nz; 922 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 923 temp->column_indices = new THRUSTINTARRAY32(a->nz); 924 temp->values = new THRUSTARRAY(a->nz); 925 926 927 stat = cusparse_hyb2csr(cusparsestruct->handle, 928 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 929 temp->values->data().get(), 930 temp->row_offsets->data().get(), 931 temp->column_indices->data().get());CHKERRCUSP(stat); 932 933 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 934 CsrMatrix *tempT= new CsrMatrix; 935 tempT->num_rows = A->rmap->n; 936 tempT->num_cols = A->cmap->n; 937 tempT->num_entries = a->nz; 938 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 939 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 940 tempT->values = new THRUSTARRAY(a->nz); 941 942 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 943 temp->num_cols, temp->num_entries, 944 temp->values->data().get(), 945 temp->row_offsets->data().get(), 946 temp->column_indices->data().get(), 947 tempT->values->data().get(), 948 tempT->column_indices->data().get(), 949 tempT->row_offsets->data().get(), 950 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 951 952 /* Last, convert CSC to HYB */ 953 cusparseHybMat_t hybMat; 954 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 955 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 956 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 957 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 958 matstructT->descr, tempT->values->data().get(), 959 tempT->row_offsets->data().get(), 960 tempT->column_indices->data().get(), 961 hybMat, 0, partition);CHKERRCUSP(stat); 962 963 /* assign the pointer */ 964 matstructT->mat = hybMat; 965 966 /* delete temporaries */ 967 if (tempT) { 968 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 969 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 970 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 971 delete (CsrMatrix*) tempT; 972 } 973 if (temp) { 974 if (temp->values) delete (THRUSTARRAY*) temp->values; 975 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 976 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 977 delete (CsrMatrix*) temp; 978 } 979 #else 980 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 981 #endif 982 } 983 /* assign the compressed row indices */ 984 matstructT->cprowIndices = new THRUSTINTARRAY; 985 986 /* assign the pointer */ 987 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 988 PetscFunctionReturn(0); 989 } 990 991 #undef __FUNCT__ 992 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 993 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 994 { 995 CUSPARRAY *xGPU, *bGPU; 996 cusparseStatus_t stat; 997 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1000 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1001 PetscErrorCode ierr; 1002 1003 PetscFunctionBegin; 1004 /* Analyze the matrix and create the transpose ... on the fly */ 1005 if (!loTriFactorT && !upTriFactorT) { 1006 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1007 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1008 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1009 } 1010 1011 /* Get the GPU pointers */ 1012 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1013 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1014 1015 /* First, reorder with the row permutation */ 1016 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1017 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1018 xGPU->begin()); 1019 1020 /* First, solve U */ 1021 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1022 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1023 upTriFactorT->csrMat->values->data().get(), 1024 upTriFactorT->csrMat->row_offsets->data().get(), 1025 upTriFactorT->csrMat->column_indices->data().get(), 1026 upTriFactorT->solveInfo, 1027 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1028 1029 /* Then, solve L */ 1030 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1031 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1032 loTriFactorT->csrMat->values->data().get(), 1033 loTriFactorT->csrMat->row_offsets->data().get(), 1034 loTriFactorT->csrMat->column_indices->data().get(), 1035 loTriFactorT->solveInfo, 1036 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1037 1038 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1039 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1040 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1041 tempGPU->begin()); 1042 1043 /* Copy the temporary to the full solution. */ 1044 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1045 1046 /* restore */ 1047 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1048 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1049 ierr = WaitForGPU();CHKERRCUSP(ierr); 1050 1051 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1052 PetscFunctionReturn(0); 1053 } 1054 1055 #undef __FUNCT__ 1056 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1057 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1058 { 1059 CUSPARRAY *xGPU,*bGPU; 1060 cusparseStatus_t stat; 1061 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1062 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1063 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1064 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1065 PetscErrorCode ierr; 1066 1067 PetscFunctionBegin; 1068 /* Analyze the matrix and create the transpose ... on the fly */ 1069 if (!loTriFactorT && !upTriFactorT) { 1070 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1071 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1072 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1073 } 1074 1075 /* Get the GPU pointers */ 1076 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1077 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1078 1079 /* First, solve U */ 1080 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1081 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1082 upTriFactorT->csrMat->values->data().get(), 1083 upTriFactorT->csrMat->row_offsets->data().get(), 1084 upTriFactorT->csrMat->column_indices->data().get(), 1085 upTriFactorT->solveInfo, 1086 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1087 1088 /* Then, solve L */ 1089 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1090 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1091 loTriFactorT->csrMat->values->data().get(), 1092 loTriFactorT->csrMat->row_offsets->data().get(), 1093 loTriFactorT->csrMat->column_indices->data().get(), 1094 loTriFactorT->solveInfo, 1095 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1096 1097 /* restore */ 1098 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1099 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1100 ierr = WaitForGPU();CHKERRCUSP(ierr); 1101 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1102 PetscFunctionReturn(0); 1103 } 1104 1105 #undef __FUNCT__ 1106 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1107 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1108 { 1109 CUSPARRAY *xGPU,*bGPU; 1110 cusparseStatus_t stat; 1111 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1112 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1113 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1114 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1115 PetscErrorCode ierr; 1116 1117 PetscFunctionBegin; 1118 /* Get the GPU pointers */ 1119 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1120 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1121 1122 /* First, reorder with the row permutation */ 1123 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1124 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1125 xGPU->begin()); 1126 1127 /* Next, solve L */ 1128 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1129 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1130 loTriFactor->csrMat->values->data().get(), 1131 loTriFactor->csrMat->row_offsets->data().get(), 1132 loTriFactor->csrMat->column_indices->data().get(), 1133 loTriFactor->solveInfo, 1134 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1135 1136 /* Then, solve U */ 1137 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1138 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1139 upTriFactor->csrMat->values->data().get(), 1140 upTriFactor->csrMat->row_offsets->data().get(), 1141 upTriFactor->csrMat->column_indices->data().get(), 1142 upTriFactor->solveInfo, 1143 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1144 1145 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1146 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1147 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1148 tempGPU->begin()); 1149 1150 /* Copy the temporary to the full solution. */ 1151 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1152 1153 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1154 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1155 ierr = WaitForGPU();CHKERRCUSP(ierr); 1156 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1157 PetscFunctionReturn(0); 1158 } 1159 1160 #undef __FUNCT__ 1161 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1162 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1163 { 1164 CUSPARRAY *xGPU,*bGPU; 1165 cusparseStatus_t stat; 1166 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1167 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1168 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1169 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1170 PetscErrorCode ierr; 1171 1172 PetscFunctionBegin; 1173 /* Get the GPU pointers */ 1174 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1175 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1176 1177 /* First, solve L */ 1178 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1179 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1180 loTriFactor->csrMat->values->data().get(), 1181 loTriFactor->csrMat->row_offsets->data().get(), 1182 loTriFactor->csrMat->column_indices->data().get(), 1183 loTriFactor->solveInfo, 1184 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1185 1186 /* Next, solve U */ 1187 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1188 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1189 upTriFactor->csrMat->values->data().get(), 1190 upTriFactor->csrMat->row_offsets->data().get(), 1191 upTriFactor->csrMat->column_indices->data().get(), 1192 upTriFactor->solveInfo, 1193 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1194 1195 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1196 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1197 ierr = WaitForGPU();CHKERRCUSP(ierr); 1198 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1199 PetscFunctionReturn(0); 1200 } 1201 1202 #undef __FUNCT__ 1203 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1204 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1205 { 1206 1207 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1208 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1209 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1210 PetscInt m = A->rmap->n,*ii,*ridx; 1211 PetscErrorCode ierr; 1212 cusparseStatus_t stat; 1213 cudaError_t err; 1214 1215 PetscFunctionBegin; 1216 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 1217 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1218 if (matstruct) { 1219 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized."); 1220 } 1221 try { 1222 cusparsestruct->nonzerorow=0; 1223 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1224 1225 if (a->compressedrow.use) { 1226 m = a->compressedrow.nrows; 1227 ii = a->compressedrow.i; 1228 ridx = a->compressedrow.rindex; 1229 } else { 1230 /* Forcing compressed row on the GPU */ 1231 int k=0; 1232 ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 1233 ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 1234 ii[0]=0; 1235 for (int j = 0; j<m; j++) { 1236 if ((a->i[j+1]-a->i[j])>0) { 1237 ii[k] = a->i[j]; 1238 ridx[k]= j; 1239 k++; 1240 } 1241 } 1242 ii[cusparsestruct->nonzerorow] = a->nz; 1243 m = cusparsestruct->nonzerorow; 1244 } 1245 1246 /* allocate space for the triangular factor information */ 1247 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1248 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1249 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1250 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 1251 1252 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 1253 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1254 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err); 1255 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1256 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 1257 1258 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1259 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1260 /* set the matrix */ 1261 CsrMatrix *matrix= new CsrMatrix; 1262 matrix->num_rows = m; 1263 matrix->num_cols = A->cmap->n; 1264 matrix->num_entries = a->nz; 1265 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1266 matrix->row_offsets->assign(ii, ii + m+1); 1267 1268 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1269 matrix->column_indices->assign(a->j, a->j+a->nz); 1270 1271 matrix->values = new THRUSTARRAY(a->nz); 1272 matrix->values->assign(a->a, a->a+a->nz); 1273 1274 /* assign the pointer */ 1275 matstruct->mat = matrix; 1276 1277 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1278 #if CUDA_VERSION>=4020 1279 CsrMatrix *matrix= new CsrMatrix; 1280 matrix->num_rows = m; 1281 matrix->num_cols = A->cmap->n; 1282 matrix->num_entries = a->nz; 1283 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1284 matrix->row_offsets->assign(ii, ii + m+1); 1285 1286 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1287 matrix->column_indices->assign(a->j, a->j+a->nz); 1288 1289 matrix->values = new THRUSTARRAY(a->nz); 1290 matrix->values->assign(a->a, a->a+a->nz); 1291 1292 cusparseHybMat_t hybMat; 1293 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1294 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1295 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1296 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1297 matstruct->descr, matrix->values->data().get(), 1298 matrix->row_offsets->data().get(), 1299 matrix->column_indices->data().get(), 1300 hybMat, 0, partition);CHKERRCUSP(stat); 1301 /* assign the pointer */ 1302 matstruct->mat = hybMat; 1303 1304 if (matrix) { 1305 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1306 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1307 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1308 delete (CsrMatrix*)matrix; 1309 } 1310 #endif 1311 } 1312 1313 /* assign the compressed row indices */ 1314 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1315 matstruct->cprowIndices->assign(ridx,ridx+m); 1316 1317 /* assign the pointer */ 1318 cusparsestruct->mat = matstruct; 1319 1320 if (!a->compressedrow.use) { 1321 ierr = PetscFree(ii);CHKERRQ(ierr); 1322 ierr = PetscFree(ridx);CHKERRQ(ierr); 1323 } 1324 cusparsestruct->workVector = new THRUSTARRAY; 1325 cusparsestruct->workVector->resize(m); 1326 } catch(char *ex) { 1327 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1328 } 1329 ierr = WaitForGPU();CHKERRCUSP(ierr); 1330 1331 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 1332 1333 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1334 } 1335 PetscFunctionReturn(0); 1336 } 1337 1338 #undef __FUNCT__ 1339 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 1340 static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1341 { 1342 PetscErrorCode ierr; 1343 1344 PetscFunctionBegin; 1345 if (right) { 1346 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1347 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1348 ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 1349 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 1350 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1351 } 1352 if (left) { 1353 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1354 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1355 ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 1356 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 1357 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1358 } 1359 PetscFunctionReturn(0); 1360 } 1361 1362 struct VecCUSPPlusEquals 1363 { 1364 template <typename Tuple> 1365 __host__ __device__ 1366 void operator()(Tuple t) 1367 { 1368 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1369 } 1370 }; 1371 1372 #undef __FUNCT__ 1373 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1374 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1375 { 1376 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1377 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1378 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1379 CUSPARRAY *xarray,*yarray; 1380 PetscErrorCode ierr; 1381 cusparseStatus_t stat; 1382 1383 PetscFunctionBegin; 1384 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1385 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1386 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1387 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1388 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1389 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1390 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1391 mat->num_rows, mat->num_cols, mat->num_entries, 1392 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1393 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1394 yarray->data().get());CHKERRCUSP(stat); 1395 } else { 1396 #if CUDA_VERSION>=4020 1397 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1398 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1399 matstruct->alpha, matstruct->descr, hybMat, 1400 xarray->data().get(), matstruct->beta, 1401 yarray->data().get());CHKERRCUSP(stat); 1402 #endif 1403 } 1404 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1405 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1406 if (!cusparsestruct->stream) { 1407 ierr = WaitForGPU();CHKERRCUSP(ierr); 1408 } 1409 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1410 PetscFunctionReturn(0); 1411 } 1412 1413 #undef __FUNCT__ 1414 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1415 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1416 { 1417 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1418 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1419 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1420 CUSPARRAY *xarray,*yarray; 1421 PetscErrorCode ierr; 1422 cusparseStatus_t stat; 1423 1424 PetscFunctionBegin; 1425 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1426 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1427 if (!matstructT) { 1428 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1429 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1430 } 1431 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1432 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1433 1434 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1435 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1436 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1437 mat->num_rows, mat->num_cols, 1438 mat->num_entries, matstructT->alpha, matstructT->descr, 1439 mat->values->data().get(), mat->row_offsets->data().get(), 1440 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1441 yarray->data().get());CHKERRCUSP(stat); 1442 } else { 1443 #if CUDA_VERSION>=4020 1444 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1445 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1446 matstructT->alpha, matstructT->descr, hybMat, 1447 xarray->data().get(), matstructT->beta, 1448 yarray->data().get());CHKERRCUSP(stat); 1449 #endif 1450 } 1451 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1452 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1453 if (!cusparsestruct->stream) { 1454 ierr = WaitForGPU();CHKERRCUSP(ierr); 1455 } 1456 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 1461 #undef __FUNCT__ 1462 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1463 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1464 { 1465 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1466 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1467 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1468 CUSPARRAY *xarray,*yarray,*zarray; 1469 PetscErrorCode ierr; 1470 cusparseStatus_t stat; 1471 1472 PetscFunctionBegin; 1473 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1474 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1475 try { 1476 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1477 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1478 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1479 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1480 1481 /* multiply add */ 1482 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1483 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1484 /* here we need to be careful to set the number of rows in the multiply to the 1485 number of compressed rows in the matrix ... which is equivalent to the 1486 size of the workVector */ 1487 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1488 mat->num_rows, mat->num_cols, 1489 mat->num_entries, matstruct->alpha, matstruct->descr, 1490 mat->values->data().get(), mat->row_offsets->data().get(), 1491 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1492 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1493 } else { 1494 #if CUDA_VERSION>=4020 1495 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1496 if (cusparsestruct->workVector->size()) { 1497 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1498 matstruct->alpha, matstruct->descr, hybMat, 1499 xarray->data().get(), matstruct->beta, 1500 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1501 } 1502 #endif 1503 } 1504 1505 /* scatter the data from the temporary into the full vector with a += operation */ 1506 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1507 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1508 VecCUSPPlusEquals()); 1509 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1510 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1511 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1512 1513 } catch(char *ex) { 1514 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1515 } 1516 ierr = WaitForGPU();CHKERRCUSP(ierr); 1517 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1518 PetscFunctionReturn(0); 1519 } 1520 1521 #undef __FUNCT__ 1522 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1523 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1524 { 1525 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1526 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1527 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1528 CUSPARRAY *xarray,*yarray,*zarray; 1529 PetscErrorCode ierr; 1530 cusparseStatus_t stat; 1531 1532 PetscFunctionBegin; 1533 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1534 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1535 if (!matstructT) { 1536 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1537 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1538 } 1539 1540 try { 1541 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1542 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1543 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1544 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1545 1546 /* multiply add with matrix transpose */ 1547 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1548 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1549 /* here we need to be careful to set the number of rows in the multiply to the 1550 number of compressed rows in the matrix ... which is equivalent to the 1551 size of the workVector */ 1552 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1553 mat->num_rows, mat->num_cols, 1554 mat->num_entries, matstructT->alpha, matstructT->descr, 1555 mat->values->data().get(), mat->row_offsets->data().get(), 1556 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1557 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1558 } else { 1559 #if CUDA_VERSION>=4020 1560 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1561 if (cusparsestruct->workVector->size()) { 1562 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1563 matstructT->alpha, matstructT->descr, hybMat, 1564 xarray->data().get(), matstructT->beta, 1565 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1566 } 1567 #endif 1568 } 1569 1570 /* scatter the data from the temporary into the full vector with a += operation */ 1571 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1572 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1573 VecCUSPPlusEquals()); 1574 1575 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1576 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1577 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1578 1579 } catch(char *ex) { 1580 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1581 } 1582 ierr = WaitForGPU();CHKERRCUSP(ierr); 1583 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 #undef __FUNCT__ 1588 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1589 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1590 { 1591 PetscErrorCode ierr; 1592 1593 PetscFunctionBegin; 1594 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1595 if (A->factortype==MAT_FACTOR_NONE) { 1596 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1597 } 1598 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1599 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1600 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1601 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1602 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1603 PetscFunctionReturn(0); 1604 } 1605 1606 /* --------------------------------------------------------------------------------*/ 1607 #undef __FUNCT__ 1608 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1609 /*@ 1610 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1611 (the default parallel PETSc format). This matrix will ultimately pushed down 1612 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1613 assembly performance the user should preallocate the matrix storage by setting 1614 the parameter nz (or the array nnz). By setting these parameters accurately, 1615 performance during matrix assembly can be increased by more than a factor of 50. 1616 1617 Collective on MPI_Comm 1618 1619 Input Parameters: 1620 + comm - MPI communicator, set to PETSC_COMM_SELF 1621 . m - number of rows 1622 . n - number of columns 1623 . nz - number of nonzeros per row (same for all rows) 1624 - nnz - array containing the number of nonzeros in the various rows 1625 (possibly different for each row) or NULL 1626 1627 Output Parameter: 1628 . A - the matrix 1629 1630 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1631 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1632 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1633 1634 Notes: 1635 If nnz is given then nz is ignored 1636 1637 The AIJ format (also called the Yale sparse matrix format or 1638 compressed row storage), is fully compatible with standard Fortran 77 1639 storage. That is, the stored row and column indices can begin at 1640 either one (as in Fortran) or zero. See the users' manual for details. 1641 1642 Specify the preallocated storage with either nz or nnz (not both). 1643 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1644 allocation. For large problems you MUST preallocate memory or you 1645 will get TERRIBLE performance, see the users' manual chapter on matrices. 1646 1647 By default, this format uses inodes (identical nodes) when possible, to 1648 improve numerical efficiency of matrix-vector products and solves. We 1649 search for consecutive rows with the same nonzero structure, thereby 1650 reusing matrix information to achieve increased efficiency. 1651 1652 Level: intermediate 1653 1654 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1655 @*/ 1656 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1657 { 1658 PetscErrorCode ierr; 1659 1660 PetscFunctionBegin; 1661 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1662 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1663 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1664 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1665 PetscFunctionReturn(0); 1666 } 1667 1668 #undef __FUNCT__ 1669 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1670 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1671 { 1672 PetscErrorCode ierr; 1673 cusparseStatus_t stat; 1674 cudaError_t err; 1675 PetscFunctionBegin; 1676 if (A->factortype==MAT_FACTOR_NONE) { 1677 try { 1678 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1679 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1680 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1681 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1682 1683 /* delete all the members in each of the data structures */ 1684 if (matstruct) { 1685 if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1686 if (matstruct->mat) { 1687 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1688 #if CUDA_VERSION>=4020 1689 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1690 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1691 #endif 1692 } else { 1693 CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1694 if (mat->values) delete (THRUSTARRAY*)mat->values; 1695 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1696 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1697 delete (CsrMatrix*)matstruct->mat; 1698 } 1699 } 1700 if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1701 if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1702 if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1703 if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1704 } 1705 if (matstructT) { 1706 if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1707 if (matstructT->mat) { 1708 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1709 #if CUDA_VERSION>=4020 1710 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1711 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1712 #endif 1713 } else { 1714 CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1715 if (matT->values) delete (THRUSTARRAY*)matT->values; 1716 if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1717 if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1718 delete (CsrMatrix*)matstructT->mat; 1719 } 1720 } 1721 if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1722 if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1723 if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1724 if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1725 } 1726 if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1727 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1728 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1729 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1730 } else { 1731 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1732 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1733 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1734 } 1735 } catch(char *ex) { 1736 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1737 } 1738 } else { 1739 /* The triangular factors */ 1740 try { 1741 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1742 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1743 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1744 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1745 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1746 1747 /* delete all the members in each of the data structures */ 1748 if (loTriFactor) { 1749 if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1750 if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1751 if (loTriFactor->csrMat) { 1752 if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1753 if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1754 if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1755 delete (CsrMatrix*)loTriFactor->csrMat; 1756 } 1757 if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1758 } 1759 1760 if (upTriFactor) { 1761 if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1762 if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1763 if (upTriFactor->csrMat) { 1764 if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values; 1765 if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1766 if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1767 delete (CsrMatrix*)upTriFactor->csrMat; 1768 } 1769 if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1770 } 1771 1772 if (loTriFactorT) { 1773 if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1774 if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1775 if (loTriFactorT->csrMat) { 1776 if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1777 if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1778 if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1779 delete (CsrMatrix*)loTriFactorT->csrMat; 1780 } 1781 if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1782 } 1783 1784 if (upTriFactorT) { 1785 if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1786 if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1787 if (upTriFactorT->csrMat) { 1788 if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1789 if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1790 if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1791 delete (CsrMatrix*)upTriFactorT->csrMat; 1792 } 1793 if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1794 } 1795 if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1796 if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1797 if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1798 if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1799 if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 1800 } catch(char *ex) { 1801 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1802 } 1803 } 1804 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1805 A->spptr = 0; 1806 1807 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1808 PetscFunctionReturn(0); 1809 } 1810 1811 #undef __FUNCT__ 1812 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1813 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1814 { 1815 PetscErrorCode ierr; 1816 cusparseStatus_t stat; 1817 cusparseHandle_t handle=0; 1818 1819 PetscFunctionBegin; 1820 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1821 if (B->factortype==MAT_FACTOR_NONE) { 1822 /* you cannot check the inode.use flag here since the matrix was just created. 1823 now build a GPU matrix data structure */ 1824 B->spptr = new Mat_SeqAIJCUSPARSE; 1825 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1826 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1827 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1828 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1829 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1830 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1831 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1832 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1833 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1834 } else { 1835 /* NEXT, set the pointers to the triangular factors */ 1836 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1837 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1838 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1839 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1840 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1841 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1842 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1843 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1844 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1845 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1846 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1847 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1848 } 1849 1850 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1851 default cusparse tri solve. Note the difference with the implementation in 1852 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1853 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1854 1855 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1856 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1857 B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 1858 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1859 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1860 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1861 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1862 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1863 1864 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1865 1866 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1867 1868 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1869 PetscFunctionReturn(0); 1870 } 1871 1872 /*M 1873 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1874 1875 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1876 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1877 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1878 1879 Options Database Keys: 1880 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1881 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1882 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1883 1884 Level: beginner 1885 1886 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1887 M*/ 1888