1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc-private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 35 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 36 static PetscErrorCode 
Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/* Attach a CUDA stream to the matrix's cusparse handle; subsequent cusparse
   calls issued through this handle run on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/* Replace the matrix's cusparse handle with a caller-supplied one, destroying
   any handle the matrix previously held, and switch the new handle to device
   pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Fix: CHKERRCUSP(stat) previously sat outside the if body (dangling
     statement), so it executed unconditionally and read an uninitialized
     status when no handle existed. Brace the conditional so the check only
     runs after cusparseDestroy(). */
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Drop the matrix's reference to its cusparse handle without destroying it
   (for handles owned elsewhere). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Composed query function: reports that this factor matrix came from the
   CUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp,
aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/* Factory: create an empty SEQAIJCUSPARSE factor matrix B for A and install
   the symbolic-factorization entry points matching the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       m = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,m,m,m,m);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): record the
   requested GPU storage format in the matrix's cusparse context. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    /* a sequential matrix has a single mult structure, so both operations
       update the same format field */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the matrix type provides one */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Process the -mat_cusparse_* options that select the GPU storage format used
   for SpMV and (on factored matrices) triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* Fix: the return value of PetscObjectOptionsBegin() was assigned to ierr
     but never checked; add the missing CHKERRQ. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option applies to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat
A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* symbolic ILU runs on the CPU; the numeric stage is redirected to the GPU version */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Delegate symbolic LU to the CPU AIJ code, then install the GPU numeric hook. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Delegate symbolic ICC to the CPU AIJ code, then install the GPU numeric hook. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Delegate symbolic Cholesky to the CPU AIJ code, then install the GPU numeric hook. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Stage the lower triangle of the host ILU factor (plus an implicit unit
   diagonal) into a CSR structure on the GPU and run the cusparse solve
   analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor =
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* nonzeros of the lower factor: strictly lower entries plus n ones on the diagonal */
      nzLower=n+ai[n]-ai[1];

      /* page-locked host staging buffers (cudaMallocHost) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* row 0 carries only its unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* the +1 accounts for the unit diagonal appended to each row */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* descriptor: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the three CSR arrays to the device */
      loTriFactor->csrMat              = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* cusparse analysis phase required before triangular solves */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Stage the upper triangle of the host ILU factor (diagonal stored inverted)
   into a CSR structure on the GPU and run the cusparse solve analysis on it. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* page-locked host staging buffers (cudaMallocHost) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* walk the rows backwards, filling from the tail of the arrays */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        offset -= (nz+1);

        /* the diagonal entry is stored as its reciprocal; off-diagonals copied verbatim */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* descriptor: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the three CSR arrays to the device */
      upTriFactor->csrMat              = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = n;
      upTriFactor->csrMat->num_cols    = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* cusparse analysis phase required before triangular solves */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Build both ILU triangular factors on the GPU, allocate the solve work
   vector, and cache the row/column permutations (only when non-identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new
THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* the row permutation (used by the lower solve) is uploaded only when it is
     not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* likewise for the column permutation (used by the upper solve) */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* From the upper-triangular host ICC/Cholesky factor, build both GPU
   triangular factors: the upper factor solved directly, and the lower factor
   realized as a TRANSPOSE solve on the same upper sparsity pattern with
   rescaled values (AALo). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* page-locked host staging buffers; AALo holds the rescaled values used
         by the transposed (lower) solve */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* the diagonal entry first, stored as its reciprocal in both arrays */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* then the off-diagonals: negated for AAUp, additionally divided by
             the diagonal for AALo */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* ---- upper factor: non-transposed solve, unit diagonal ---- */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      upTriFactor->csrMat              = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* ---- lower factor: the same upper pattern solved with
         CUSPARSE_OPERATION_TRANSPOSE (hence FILL_MODE_UPPER is intentional) ---- */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      loTriFactor->csrMat              = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Build the ICC triangular factors on the GPU, allocate the solve work
   vector, and cache the permutation (same index set on both sides) when it is
   not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* off-diagonal entries counted for both triangles, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU on the CPU, then install the ordering-appropriate MatSolve
   variants and push the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering admits the permutation-free solve kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky on the CPU, then install the ordering-appropriate MatSolve
   variants and push the triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 721 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 722 if (perm_identity) { 723 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 724 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 725 } else { 726 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 727 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 728 } 729 730 /* get the triangular factors */ 731 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 732 PetscFunctionReturn(0); 733 } 734 735 #undef __FUNCT__ 736 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 737 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 738 { 739 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 740 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 741 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 742 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 743 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 744 cusparseStatus_t stat; 745 cusparseIndexBase_t indexBase; 746 cusparseMatrixType_t matrixType; 747 cusparseFillMode_t fillMode; 748 cusparseDiagType_t diagType; 749 750 PetscFunctionBegin; 751 752 /*********************************************/ 753 /* Now the Transpose of the Lower Tri Factor */ 754 /*********************************************/ 755 756 /* allocate space for the transpose of the lower triangular factor */ 757 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 758 759 /* set the matrix descriptors of the lower triangular factor */ 760 matrixType = cusparseGetMatType(loTriFactor->descr); 761 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 762 fillMode = 
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
             CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;  /* flipped: we store the transpose */
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the transpose is stored explicitly, so solves on it are NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC
     (csr2csc with ACTION_NUMERIC transposes values as well as the pattern) */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? (A->spptr aliases cusparseTriFactors above) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor,
     flipping the fill mode since we store the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
             CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: explicit transpose, so NON_TRANSPOSE solve */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* assign the pointer. Is this really necessary? (A->spptr aliases cusparseTriFactors above) */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds an explicit transpose of the GPU matrix (CSR or ELL/HYB storage) so
   MatMultTranspose can run a NON_TRANSPOSE SpMV on it. Stored in
   cusparsestruct->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): dead store — matstructT is unconditionally overwritten with
     `new` below; callers check cusparsestruct->matTranspose before calling */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose mult structure */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta: device-resident scalars (ALPHA/BETA are host globals
     defined elsewhere in this file), required by POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC, of the CSR matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR (hyb2csr requires CUDA 5.0+) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: the HYB matrix owns its own storage */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty here; the transpose is not stored compressed) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b using explicit transposes of the triangular factors
   (built lazily), with row/column permutations applied via Thrust.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U^T (stored explicitly, hence the NON_TRANSPOSE solveOp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Same as MatSolveTranspose_SeqAIJCUSPARSE but for the identity permutation:
   no Thrust permutation copies are needed.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U^T (explicit transpose, NON_TRANSPOSE solveOp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Forward/backward triangular solve A x = b on the GPU: permute b by the row
   permutation, solve L then U, then apply the column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Same as MatSolve_SeqAIJCUSPARSE for the identity permutation:
   no Thrust permutation copies are needed.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U */
  stat =
cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the CPU SeqAIJ matrix to the GPU in the selected storage format
   (CSR or ELL/HYB), always using a compressed-row layout on the GPU
   (rows with no nonzeros are squeezed out; cprowIndices maps back).
   No-op unless the GPU copy is unallocated or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count rows with at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets (ii) and the
           original-row index map (ridx) for the nonempty rows */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix mult structure */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha/beta scalars for POINTER_MODE_DEVICE SpMV */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage through a temporary CSR matrix, then convert to HYB */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): with CUDA_VERSION<4020 this branch silently leaves
           matstruct->mat unset — confirm whether an error should be raised here */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated above only in the non-compressedrow case */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) work vectors compatible with the matrix:
   right vector matches the column layout, left vector the row layout.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise accumulate, get<1> += get<0>; used to scatter
   compressed-row SpMV results back into the full-length output vector */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A x on the GPU via cuSPARSE SpMV (CSR or HYB path depending on the
   selected storage format).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr =
VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T x on the GPU: builds the explicit transpose on first use, then
   runs a NON_TRANSPOSE SpMV on it.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = y + A x on the GPU: copies y into z, runs SpMV into the compressed-row
   work vector, then scatter-adds into z via VecCUSPPlusEquals.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired/restored but never read here — the y
       contribution comes from the VecCopy above; confirm the pair is only
       kept for access bookkeeping */
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   z = y + A^T x on the GPU, using the lazily-built explicit transpose.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ...
which is equivalent to the 1559 size of the workVector */ 1560 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1561 mat->num_rows, mat->num_cols, 1562 mat->num_entries, matstructT->alpha, matstructT->descr, 1563 mat->values->data().get(), mat->row_offsets->data().get(), 1564 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1565 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1566 } else { 1567 #if CUDA_VERSION>=4020 1568 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1569 if (cusparsestruct->workVector->size()) { 1570 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1571 matstructT->alpha, matstructT->descr, hybMat, 1572 xarray->data().get(), matstructT->beta, 1573 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1574 } 1575 #endif 1576 } 1577 1578 /* scatter the data from the temporary into the full vector with a += operation */ 1579 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1580 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1581 VecCUSPPlusEquals()); 1582 1583 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1584 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1585 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1586 1587 } catch(char *ex) { 1588 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1589 } 1590 ierr = WaitForGPU();CHKERRCUSP(ierr); 1591 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1592 PetscFunctionReturn(0); 1593 } 1594 1595 #undef __FUNCT__ 1596 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1597 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat 
A,MatAssemblyType mode) 1598 { 1599 PetscErrorCode ierr; 1600 1601 PetscFunctionBegin; 1602 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1603 if (A->factortype==MAT_FACTOR_NONE) { 1604 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1605 } 1606 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1607 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1608 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1609 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1610 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1611 PetscFunctionReturn(0); 1612 } 1613 1614 /* --------------------------------------------------------------------------------*/ 1615 #undef __FUNCT__ 1616 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1617 /*@ 1618 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1619 (the default parallel PETSc format). This matrix will ultimately pushed down 1620 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1621 assembly performance the user should preallocate the matrix storage by setting 1622 the parameter nz (or the array nnz). By setting these parameters accurately, 1623 performance during matrix assembly can be increased by more than a factor of 50. 1624 1625 Collective on MPI_Comm 1626 1627 Input Parameters: 1628 + comm - MPI communicator, set to PETSC_COMM_SELF 1629 . m - number of rows 1630 . n - number of columns 1631 . nz - number of nonzeros per row (same for all rows) 1632 - nnz - array containing the number of nonzeros in the various rows 1633 (possibly different for each row) or NULL 1634 1635 Output Parameter: 1636 . A - the matrix 1637 1638 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1639 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1640 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1641 1642 Notes: 1643 If nnz is given then nz is ignored 1644 1645 The AIJ format (also called the Yale sparse matrix format or 1646 compressed row storage), is fully compatible with standard Fortran 77 1647 storage. That is, the stored row and column indices can begin at 1648 either one (as in Fortran) or zero. See the users' manual for details. 1649 1650 Specify the preallocated storage with either nz or nnz (not both). 1651 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1652 allocation. For large problems you MUST preallocate memory or you 1653 will get TERRIBLE performance, see the users' manual chapter on matrices. 1654 1655 By default, this format uses inodes (identical nodes) when possible, to 1656 improve numerical efficiency of matrix-vector products and solves. We 1657 search for consecutive rows with the same nonzero structure, thereby 1658 reusing matrix information to achieve increased efficiency. 
1659 1660 Level: intermediate 1661 1662 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1663 @*/ 1664 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1665 { 1666 PetscErrorCode ierr; 1667 1668 PetscFunctionBegin; 1669 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1670 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1671 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1672 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1673 PetscFunctionReturn(0); 1674 } 1675 1676 #undef __FUNCT__ 1677 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1678 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1679 { 1680 PetscErrorCode ierr; 1681 cusparseStatus_t stat; 1682 cudaError_t err; 1683 PetscFunctionBegin; 1684 if (A->factortype==MAT_FACTOR_NONE) { 1685 try { 1686 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1687 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1688 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1689 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1690 1691 /* delete all the members in each of the data structures */ 1692 if (matstruct) { 1693 if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1694 if (matstruct->mat) { 1695 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1696 #if CUDA_VERSION>=4020 1697 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1698 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1699 #endif 1700 } else { 1701 CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1702 if (mat->values) delete (THRUSTARRAY*)mat->values; 1703 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1704 if 
(mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1705 delete (CsrMatrix*)matstruct->mat; 1706 } 1707 } 1708 if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1709 if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1710 if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1711 if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1712 } 1713 if (matstructT) { 1714 if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1715 if (matstructT->mat) { 1716 if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1717 #if CUDA_VERSION>=4020 1718 cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1719 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1720 #endif 1721 } else { 1722 CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1723 if (matT->values) delete (THRUSTARRAY*)matT->values; 1724 if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1725 if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1726 delete (CsrMatrix*)matstructT->mat; 1727 } 1728 } 1729 if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1730 if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1731 if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1732 if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1733 } 1734 if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1735 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1736 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1737 A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1738 } else { 1739 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1740 if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 
1741 if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1742 } 1743 } catch(char *ex) { 1744 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1745 } 1746 } else { 1747 /* The triangular factors */ 1748 try { 1749 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1750 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1751 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1752 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1753 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1754 1755 /* delete all the members in each of the data structures */ 1756 if (loTriFactor) { 1757 if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1758 if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1759 if (loTriFactor->csrMat) { 1760 if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1761 if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1762 if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1763 delete (CsrMatrix*)loTriFactor->csrMat; 1764 } 1765 if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1766 } 1767 1768 if (upTriFactor) { 1769 if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1770 if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1771 if (upTriFactor->csrMat) { 1772 if (upTriFactor->csrMat->values) delete 
(THRUSTARRAY*)upTriFactor->csrMat->values; 1773 if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1774 if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1775 delete (CsrMatrix*)upTriFactor->csrMat; 1776 } 1777 if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1778 } 1779 1780 if (loTriFactorT) { 1781 if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1782 if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1783 if (loTriFactorT->csrMat) { 1784 if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1785 if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1786 if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1787 delete (CsrMatrix*)loTriFactorT->csrMat; 1788 } 1789 if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1790 } 1791 1792 if (upTriFactorT) { 1793 if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1794 if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1795 if (upTriFactorT->csrMat) { 1796 if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1797 if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1798 if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1799 delete (CsrMatrix*)upTriFactorT->csrMat; 1800 } 1801 if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1802 } 1803 if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1804 if 
(cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1805 if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1806 if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1807 if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 1808 } catch(char *ex) { 1809 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1810 } 1811 } 1812 /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 1813 A->spptr = 0; 1814 1815 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1816 PetscFunctionReturn(0); 1817 } 1818 1819 #undef __FUNCT__ 1820 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1821 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1822 { 1823 PetscErrorCode ierr; 1824 cusparseStatus_t stat; 1825 cusparseHandle_t handle=0; 1826 1827 PetscFunctionBegin; 1828 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1829 if (B->factortype==MAT_FACTOR_NONE) { 1830 /* you cannot check the inode.use flag here since the matrix was just created. 
1831 now build a GPU matrix data structure */ 1832 B->spptr = new Mat_SeqAIJCUSPARSE; 1833 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1834 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1835 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1836 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1837 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1838 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1839 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1840 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1841 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1842 } else { 1843 /* NEXT, set the pointers to the triangular factors */ 1844 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1845 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1846 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1847 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1848 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1849 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1850 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1851 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1852 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1853 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1854 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1855 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1856 } 1857 1858 /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1859 default cusparse tri solve. 
Note the difference with the implementation in 1860 MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1861 ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 1862 1863 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1864 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1865 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1866 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1867 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1868 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1869 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1870 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1871 1872 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1873 1874 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1875 1876 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1877 PetscFunctionReturn(0); 1878 } 1879 1880 /*M 1881 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1882 1883 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1884 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1885 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1886 1887 Options Database Keys: 1888 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1889 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1890 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1891 1892 Level: beginner 1893 1894 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1895 M*/ 1896 1897 #undef __FUNCT__ 1898 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1899 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1900 { 1901 cusparseStatus_t stat; 1902 cusparseHandle_t handle; 1903 1904 PetscFunctionBegin; 1905 if (*cusparsestruct) { 1906 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1907 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1908 delete (*cusparsestruct)->workVector; 1909 if (handle = (*cusparsestruct)->handle) { 1910 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1911 } 1912 delete *cusparsestruct; 1913 *cusparsestruct = 0; 1914 } 1915 PetscFunctionReturn(0); 1916 } 1917 1918 #undef __FUNCT__ 1919 #define __FUNCT__ "CsrMatrix_Destroy" 1920 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1921 { 1922 PetscFunctionBegin; 1923 if (*mat) { 1924 delete (*mat)->values; 1925 delete (*mat)->column_indices; 1926 delete (*mat)->row_offsets; 1927 delete *mat; 1928 *mat = 0; 1929 } 1930 PetscFunctionReturn(0); 1931 } 1932 1933 #undef __FUNCT__ 1934 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1935 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1936 { 1937 cusparseStatus_t stat; 1938 PetscErrorCode ierr; 1939 1940 PetscFunctionBegin; 1941 if (*trifactor) { 1942 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); } 1943 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); } 1944 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1945 delete *trifactor; 1946 *trifactor = 0; 1947 } 1948 PetscFunctionReturn(0); 1949 } 

#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/* Frees one mult structure: the stored matrix (HYB/ELL or CSR, depending on format),
   the matrix descriptor, the compressed-row index array, and the device-resident
   alpha/beta scalars. Safe to call with *matstruct == NULL. */
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* guard as in the rest of this file: the HYB API requires CUDA >= 4.2 */
#if CUDA_VERSION>=4020
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
      } else {
        CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
    if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/* Frees the full triangular-factor container: all four factor structures, the
   row/column permutation index arrays, the work vector, and the cusparse handle.
   Safe to call with *trifactors == NULL. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the returned error codes: the destroy helpers return PetscErrorCode */
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSP(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
