/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc-private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option strings for MatCUSPARSEStorageFormat; the three trailing entries are the
   enum-type name, the option prefix, and the NULL sentinel expected by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* forward declarations of the factorization routines implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* forward declarations of the solve/SpMV kernels and option handling */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* forward declarations of the destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the CUDA stream in the matrix's CUSPARSE context and
   installs it on the matrix's CUSPARSE handle so subsequent library calls use it.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - installs a caller-supplied CUSPARSE handle on the matrix,
   destroying any handle the matrix previously stored, and selects device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* BUGFIX: braces are required here. Previously CHKERRCUSP(stat) was a second,
       unconditional statement after the unbraced if, so when no handle existed it
       checked an uninitialized stat. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - forgets (does not destroy) the CUSPARSE handle stored on
   the matrix; used when the handle is owned elsewhere.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp,
aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates an empty square factor
   matrix B of type MATSEQAIJCUSPARSE and installs the symbolic-factorization
   function pointers appropriate for the requested factor type (LU/ILU/ILUDT or
   Cholesky/ICC); any other factor type is rejected.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;            /* factor is square: n x n, like A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* actual storage is allocated by the symbolic factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE context.  ELL/HYB formats
   require CUDA 4.2; on older CUDA only CSR is accepted.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    /* sequential matrices have a single format slot, so ALL and MULT act the same here */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed; a no-op for other types */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes -mat_cusparse_mult_storage_format (SpMV only; skipped on factor matrices)
   and -mat_cusparse_storage_format (SpMV and triangular solve) for a SEQAIJCUSPARSE
   matrix, forwarding the chosen format to MatCUSPARSESetFormat().
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead is called before PetscObjectOptionsBegin here;
     the usual PETSc pattern nests Head inside a Begin/End pair - confirm this
     ordering against other MatSetFromOptions_* implementations. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat
A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* delegate the symbolic phase to the CPU AIJ implementation, then redirect the
     numeric phase to the CUSPARSE version so the factors land on the GPU */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* symbolic LU: CPU AIJ symbolic phase + CUSPARSE numeric phase (see ILU variant above) */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* symbolic ICC: CPU AIJ symbolic phase + CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* symbolic Cholesky: CPU AIJ symbolic phase + CUSPARSE Cholesky numeric phase */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Extracts the unit lower triangular factor L from the combined LU factor stored in
   the AIJ matrix A, assembles it as a CSR matrix in pinned host memory (adding the
   implicit 1's on the diagonal), uploads it to the GPU via Thrust arrays, and runs
   the CUSPARSE solve analysis on it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor =
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is absent or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the subsequent host-to-device copies are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L is just the unit diagonal entry */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v       = aa;
      vi      = aj;
      offset  = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies the pinned host buffers into device memory */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Extracts the upper triangular factor U (inverted diagonal stored explicitly) from
   the combined LU factor in A, assembles it as CSR in pinned host memory, uploads it
   to the GPU, and runs the CUSPARSE solve analysis.  Companion to
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix() above.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the host-to-device copies below are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      /* rows are walked backwards, filling the CSR arrays from the end toward the front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the AIJ factor stores the inverse of the diagonal; v[nz] is that stored value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies the pinned host buffers into device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Builds both GPU triangular factors for an ILU/LU factored matrix, allocates the
   intermediate work vector used between the two triangular solves, and uploads the
   row/column permutation index sets when the orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new
THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  /* only upload permutations when they differ from the identity; the solve paths
     check these pointers to decide whether to permute the vectors */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both GPU triangular factors for an ICC/Cholesky factored matrix from the
   single stored upper factor: the upper solve uses U directly (unit diagonal, with
   1/d entries on the diagonal), while the lower solve reuses the same sparsity
   pattern with scaled values and solves the TRANSPOSE of the stored upper matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is also cast to Mat_SeqSBAIJ here - presumably the ICC
     factor stores its data with an SBAIJ-compatible layout; confirm against the
     Cholesky/ICC numeric factorization that fills this matrix */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo shares the sparsity of the upper factor */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] holds the stored diagonal value for this row */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for the upper solve; scale by the diagonal for the lower */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* assign() copies the pinned host buffers into device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor is stored with UPPER fill and solved with OPERATION_TRANSPOSE,
         i.e. the transpose of the stored upper pattern is the lower factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same pattern (AiUp/AjUp) as the upper factor, but the AALo values */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      /* release the pinned host staging buffers; the device copies own the data now */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Builds the GPU triangular factors for an ICC/Cholesky factored matrix, allocates
   the work vector used between the two triangular solves, and uploads the (single)
   permutation as both row and column permutation when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* off-diagonals are shared by both factors, the diagonal is counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the same permutation is used on both sides */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization: runs the CPU AIJ numeric factorization, selects the
   natural-ordering fast path for the solves when both orderings are the identity,
   then builds/uploads the GPU triangular factors.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric Cholesky factorization: runs the CPU AIJ numeric factorization, selects
   the natural-ordering solve path when the permutation is the identity, then
   builds/uploads the GPU triangular factors.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   =
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
                 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; /* transposing flips the fill mode */
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the transpose is stored explicitly, so solves on it are non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPASE_FILL_MODE_UPPER == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds the explicit transpose of the matrix on the GPU (stored in the
   configured format, CSR or ELL/HYB) so that MatMultTranspose can be done
   as a non-transposed SpMV on the transposed data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose mult information
     NOTE(review): any value read into matstructT above is overwritten here;
     callers invoke this only when matTranspose is NULL — TODO confirm. */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta: device-resident scalars for the SpMV calls */
  err  = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err  = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err  = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err  = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows    = A->rmap->n;
    matrixT->num_cols    = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of the original CSR matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows       = A->rmap->n;
    tempT->num_cols       = A->cmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: the transpose is stored uncompressed) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Transpose triangular solve x = (LU)^-T b with row/column permutations.
   Uses explicitly transposed factors (built lazily on first call) so the
   cusparse solves run in NON_TRANSPOSE mode.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (transpose solve order is U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Transpose triangular solve for the natural-ordering case: identical to
   MatSolveTranspose_SeqAIJCUSPARSE but skips the permutation scatters.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U (transpose solve order is U then L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Triangular solve x = (LU)^-1 b with row/column permutations: scatter b by
   the row permutation, solve L then U on the GPU, then scatter by the column
   permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;

  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Triangular solve for the natural-ordering case: solve L then U directly,
   no permutation scatters needed.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the CSR data of the matrix from host to GPU when the GPU copy is
   missing or stale, always in compressed-row form (rows with no nonzeros are
   squeezed out; cprowIndices maps compressed rows back to full rows), and in
   the storage format selected in cusparsestruct->format (CSR or ELL/HYB).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* destroy any stale GPU copy before rebuilding */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build the compressed row offsets
           and the compressed-row -> full-row index map on the host */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix mult information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha/beta scalars for the SpMV calls */
      err  = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err  = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err  = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage through a temporary CSR matrix, then convert to HYB/ELL */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows    = m;
        matrix->num_cols    = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): with CUDA_VERSION<4020 this branch leaves matstruct->mat
           unset rather than raising an error — TODO confirm intended */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      if (!a->compressedrow.use) {
        /* ii/ridx were allocated above only in the forced-compression path */
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP-typed (VECSEQCUSP) left and/or right work vectors compatible
   with the matrix layout; either output pointer may be NULL to skip it.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise accumulate, get<1>(t) += get<0>(t).
   Used to scatter-add compressed SpMV results into a full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A*x using cusparse SpMV on the GPU copy of the matrix (CSR or HYB/ELL
   depending on the chosen storage format).

   NOTE(review): in the CSR path mat->num_rows is the compressed row count
   when compression was forced in MatSeqAIJCUSPARSECopyToGPU, while yy is a
   full-length vector — TODO confirm the output layout in that case.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T*x: performed as a NON_TRANSPOSE SpMV on an explicitly stored
   transpose of the matrix, which is generated lazily on first use.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT =
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = y + A*x: copy y into z, run the SpMV into the compressed work vector,
   then scatter-add the work vector into z through cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired/restored but not otherwise used here;
       the contribution of yy comes from the VecCopy above */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = y + A^T*x using the lazily generated explicit transpose.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1557 size of the workVector */ 1558 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1559 mat->num_rows, mat->num_cols, 1560 mat->num_entries, matstructT->alpha, matstructT->descr, 1561 mat->values->data().get(), mat->row_offsets->data().get(), 1562 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1563 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1564 } else { 1565 #if CUDA_VERSION>=4020 1566 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1567 if (cusparsestruct->workVector->size()) { 1568 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1569 matstructT->alpha, matstructT->descr, hybMat, 1570 xarray->data().get(), matstructT->beta, 1571 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1572 } 1573 #endif 1574 } 1575 1576 /* scatter the data from the temporary into the full vector with a += operation */ 1577 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1578 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1579 VecCUSPPlusEquals()); 1580 1581 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1582 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1583 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1584 1585 } catch(char *ex) { 1586 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1587 } 1588 ierr = WaitForGPU();CHKERRCUSP(ierr); 1589 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 #undef __FUNCT__ 1594 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1595 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat 
A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* push the assembled host matrix to the GPU; factored matrices keep their
     data in the triangular-factor structures instead */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  /* (re)install the GPU mult kernels after a final assembly */
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees the GPU-side data (mult structures or triangular factors, depending
   on whether the matrix is factored) and then delegates to the host AIJ
   destroy routine. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing was ever allocated on the GPU if valid_GPU_matrix is still UNALLOCATED */
    if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
      ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor: builds the host AIJ matrix, attaches the GPU-side data
   structure appropriate for the factor type, and installs the CUSPARSE ops. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
1704 now build a GPU matrix data structure */ 1705 B->spptr = new Mat_SeqAIJCUSPARSE; 1706 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1707 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1708 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1709 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1710 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1711 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1712 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1713 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1714 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1715 } else { 1716 /* NEXT, set the pointers to the triangular factors */ 1717 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1718 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1719 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1720 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1724 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1725 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1726 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1727 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1728 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1729 } 1730 1731 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1732 default cusparse tri solve. 
Note the difference with the implementation in 1733 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1734 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1735 1736 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1737 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1738 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1739 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1740 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1741 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1742 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1743 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1744 1745 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1746 1747 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1748 1749 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1750 PetscFunctionReturn(0); 1751 } 1752 1753 /*M 1754 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1755 1756 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1757 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1758 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1759 1760 Options Database Keys: 1761 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1762 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1763 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1764 1765 Level: beginner 1766 1767 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1768 M*/ 1769 1770 #undef __FUNCT__ 1771 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1772 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1773 { 1774 cusparseStatus_t stat; 1775 cusparseHandle_t handle; 1776 1777 PetscFunctionBegin; 1778 if (*cusparsestruct) { 1779 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1780 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1781 delete (*cusparsestruct)->workVector; 1782 if (handle = (*cusparsestruct)->handle) { 1783 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1784 } 1785 delete *cusparsestruct; 1786 *cusparsestruct = 0; 1787 } 1788 PetscFunctionReturn(0); 1789 } 1790 1791 #undef __FUNCT__ 1792 #define __FUNCT__ "CsrMatrix_Destroy" 1793 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1794 { 1795 PetscFunctionBegin; 1796 if (*mat) { 1797 delete (*mat)->values; 1798 delete (*mat)->column_indices; 1799 delete (*mat)->row_offsets; 1800 delete *mat; 1801 *mat = 0; 1802 } 1803 PetscFunctionReturn(0); 1804 } 1805 1806 #undef __FUNCT__ 1807 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1808 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1809 { 1810 cusparseStatus_t stat; 1811 PetscErrorCode ierr; 1812 1813 PetscFunctionBegin; 1814 if (*trifactor) { 1815 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); } 1816 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); } 1817 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1818 delete *trifactor; 1819 *trifactor = 0; 1820 } 1821 PetscFunctionReturn(0); 1822 } 

#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/* Frees one GPU mult structure.  The storage format decides how the opaque
   mat pointer is released: HYB/ELL via cusparseDestroyHybMat, CSR via
   CsrMatrix_Destroy.  Also releases the descriptor, the compressed-row index
   vector, and the device-resident alpha/beta scalars; NULL-safe. */
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code (was previously ignored) */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta were allocated on the device with cudaMalloc */
    if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
    if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/* Frees all four triangular-factor structures (lower/upper, plain and
   transposed), the permutation index vectors, the work vector, and the
   cusparse handle; NULL-safe. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the error codes of the destroy helpers (was previously ignored),
       consistent with the CHKERRQ convention used elsewhere in this file */
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* extra parens: assignment is intentional */
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
