1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode 
Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/* Attach a CUDA stream to the matrix's cusparse handle; all subsequent
   cusparse calls made through this handle are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/* Replace the matrix's cusparse handle with a caller-supplied one.
   Any previously owned handle is destroyed first; the new handle is put in
   device pointer mode to match the rest of this file's usage. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Bug fix: the original brace-less "if" left CHKERRCUSP(stat) outside the
     conditional, so when no handle existed an UNINITIALIZED stat was checked.
     Braces keep the error check tied to the cusparseDestroy() call. */
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Drop (but do NOT destroy) the matrix's cusparse handle reference.
   NOTE(review): the handle is only zeroed here — presumably ownership lies
   with whoever installed it via MatCUSPARSESetHandle(); confirm callers. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Report the solver package name (MATSOLVERCUSPARSE) for this factor matrix. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/* Factory routine registered for MATSEQAIJCUSPARSE: creates an (empty,
   square) factor matrix B of the same type and wires up the symbolic-factor
   function pointers for the requested factor kind. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);  /* factors are square */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format. ELL/HYB require CUDA >= 4.2. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* older CUDA: only CSR is available; reject the formats we cannot honor */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Options processing: lets the user pick the GPU storage format for SpMV
   (-mat_cusparse_mult_storage_format) and for everything including the
   triangular solves (-mat_cusparse_storage_format). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): this second query also runs for factored matrices, where
     A->spptr holds Mat_SeqAIJCUSPARSETriFactors rather than
     Mat_SeqAIJCUSPARSE — verify cusparsestruct->format is meaningful there. */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* ILU(k) symbolic factorization: delegate to the host SeqAIJ routine, then
   install the CUSPARSE numeric-factorization entry point on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Full LU symbolic factorization: same pattern — host symbolic, GPU numeric. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* ICC(k) symbolic factorization: host symbolic, CUSPARSE Cholesky numeric. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Full Cholesky symbolic factorization: host symbolic, CUSPARSE numeric. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/* Pack the L factor of an ILU-factored SeqAIJ matrix (unit diagonal added
   explicitly) into a CSR CsrMatrix on the GPU and run the CUSPARSE triangular
   solve analysis on it. A is the *factored* matrix, so a->i/a->j/a->a hold
   the combined L/U storage — assumed from the ai/aj/aa indexing below. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is missing or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix
         including 1's on the diagonal: n unit diagonals plus the strictly
         lower entries of rows 1..n-1 */
      nzLower=n+ai[n]-ai[1];

      /* Allocate space for the lower triangular matrix in pinned host memory
         (cudaMallocHost) so the thrust assign() transfers are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 is just the unit diagonal */
      AiLo[0] = (PetscInt) 0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt) 0;
      AALo[0] = (MatScalar) 1.0;
      v  = aa;
      vi = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill,
         unit diagonal (the explicit 1.0's above match DIAG_TYPE_UNIT) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix and upload the three CSR arrays to the GPU */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/* Pack the U factor (non-unit diagonal, stored inverted in the factored AIJ
   layout via a->diag) into a CSR CsrMatrix on the GPU and run the CUSPARSE
   triangular solve analysis. Rows are assembled back-to-front because the
   U rows are addressed through adiag[]. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular
         matrix; adiag[] decreases, hence the order of the subtraction */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the factored storage keeps the
           inverted pivot, so 1./v[nz] recovers the actual diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, NON-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix and upload the CSR arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* Driver for the ILU/LU GPU setup: builds both triangular factors, allocates
   the solve work vector, and uploads the row/column permutations (only when
   they are not the identity, so the identity case skips the gather/scatter). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /* lower triangular (row permutation) indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular (column permutation) indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/* Builds BOTH triangular factors for ICC/Cholesky from the single stored
   upper factor: U is uploaded as-is and L is realized as U^T (see the
   TRANSPOSE solve op below).
   NOTE(review): A->data is cast to Mat_SeqAIJ *and* Mat_SeqSBAIJ here —
   presumably the factored layouts agree on i/j/a; confirm against the
   Cholesky factor storage before touching this. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
/* (continuation of MatSeqAIJCUSPARSEBuildICCTriMatrices) */
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix; the same index pattern (AiUp/AjUp)
         is shared by the transposed lower factor, only the values differ */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; the stored pivot is inverted,
           so 1.0/v[nz] is the diagonal entry of U (and of L = U^T scaled) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; L additionally divides by the
             (inverted) pivot — mirrors the factored-storage conventions */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: upper fill, UNIT diagonal for U */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix and upload the CSR arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. The fill mode is UPPER even though
         this is the "lower" factor: the same upper-triangular pattern is
         reused and the lower solve is realized via the TRANSPOSE op below. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation: solve with U^T, i.e. the lower factor */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity as U, values from AALo */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* Driver for the ICC/Cholesky GPU setup: builds the triangular factors,
   allocates the solve work vector, and uploads the (symmetric) permutation
   for both row and column gather/scatter when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* only U is stored; both factors together hold 2*(nz-n) off-diagonals + n diagonals */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices: the same permutation serves rows and columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU/ILU factorization: run the host SeqAIJ numeric factorization,
   pick the natural-ordering fast path when both permutations are identities,
   then build/upload the GPU triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
/* (continuation of MatLUFactorNumeric_SeqAIJCUSPARSE) */
  /* factor numerically on the host first */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation gather/scatter entirely */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC factorization: host numeric factorization, then the
   same solve-routine selection and GPU upload as the LU path. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/ 723 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 724 if (perm_identity) { 725 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 726 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 727 } else { 728 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 729 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 730 } 731 732 /* get the triangular factors */ 733 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737 #undef __FUNCT__ 738 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 739 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 740 { 741 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 742 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 743 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 744 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 745 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 746 cusparseStatus_t stat; 747 cusparseIndexBase_t indexBase; 748 cusparseMatrixType_t matrixType; 749 cusparseFillMode_t fillMode; 750 cusparseDiagType_t diagType; 751 752 PetscFunctionBegin; 753 754 /*********************************************/ 755 /* Now the Transpose of the Lower Tri Factor */ 756 /*********************************************/ 757 758 /* allocate space for the transpose of the lower triangular factor */ 759 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 760 761 /* set the matrix descriptors of the lower triangular factor */ 762 matrixType = cusparseGetMatType(loTriFactor->descr); 763 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 764 fillMode = 
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 765 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 766 diagType = cusparseGetMatDiagType(loTriFactor->descr); 767 768 /* Create the matrix description */ 769 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 770 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 771 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 772 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 773 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 774 775 /* Create the solve analysis information */ 776 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 777 778 /* set the operation */ 779 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 780 781 /* allocate GPU space for the CSC of the lower triangular factor*/ 782 loTriFactorT->csrMat = new CsrMatrix; 783 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 784 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 785 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 786 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 787 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 788 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 789 790 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 791 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 792 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 793 loTriFactor->csrMat->values->data().get(), 794 loTriFactor->csrMat->row_offsets->data().get(), 795 loTriFactor->csrMat->column_indices->data().get(), 796 loTriFactorT->csrMat->values->data().get(), 797 loTriFactorT->csrMat->column_indices->data().get(), 798 loTriFactorT->csrMat->row_offsets->data().get(), 799 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 800 801 /* perform the solve analysis on the transposed matrix */ 802 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 803 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 804 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 805 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 806 loTriFactorT->solveInfo);CHKERRCUSP(stat); 807 808 /* assign the pointer. Is this really necessary? */ 809 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 810 811 /*********************************************/ 812 /* Now the Transpose of the Upper Tri Factor */ 813 /*********************************************/ 814 815 /* allocate space for the transpose of the upper triangular factor */ 816 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 817 818 /* set the matrix descriptors of the upper triangular factor */ 819 matrixType = cusparseGetMatType(upTriFactor->descr); 820 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 821 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
822 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 823 diagType = cusparseGetMatDiagType(upTriFactor->descr); 824 825 /* Create the matrix description */ 826 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 827 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 828 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 829 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 830 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 831 832 /* Create the solve analysis information */ 833 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 834 835 /* set the operation */ 836 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 837 838 /* allocate GPU space for the CSC of the upper triangular factor*/ 839 upTriFactorT->csrMat = new CsrMatrix; 840 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 841 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 842 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 843 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 844 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 845 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 846 847 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 848 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 849 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 850 upTriFactor->csrMat->values->data().get(), 851 upTriFactor->csrMat->row_offsets->data().get(), 852 upTriFactor->csrMat->column_indices->data().get(), 853 upTriFactorT->csrMat->values->data().get(), 854 upTriFactorT->csrMat->column_indices->data().get(), 855 upTriFactorT->csrMat->row_offsets->data().get(), 856 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 857 858 /* perform the solve analysis on the transposed matrix */ 859 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 860 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 861 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 862 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 863 upTriFactorT->solveInfo);CHKERRCUSP(stat); 864 865 /* assign the pointer. Is this really necessary? 
*/ 866 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 867 PetscFunctionReturn(0); 868 } 869 870 #undef __FUNCT__ 871 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 872 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 873 { 874 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 875 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 876 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 877 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 878 cusparseStatus_t stat; 879 cusparseIndexBase_t indexBase; 880 cudaError_t err; 881 882 PetscFunctionBegin; 883 884 /* allocate space for the triangular factor information */ 885 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 886 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 887 indexBase = cusparseGetMatIndexBase(matstruct->descr); 888 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 889 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 890 891 /* set alpha and beta */ 892 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 893 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 894 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 895 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 896 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 897 898 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 899 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 900 CsrMatrix *matrixT= new CsrMatrix; 901 matrixT->num_rows = A->rmap->n; 902 matrixT->num_cols = A->cmap->n; 903 matrixT->num_entries = a->nz; 904 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
/* --- continuation of MatSeqAIJCUSPARSEGenerateTransposeForMult (CSR branch) --- */
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose: csr2csc's CSC outputs land in matrixT's CSR arrays,
       i.e. matrixT holds the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    /* NOTE(review): as in the CSR branch, rows/cols are not swapped — confirm square-only use */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    /* ELL forces a pure Ellpack partition; HYB lets cusparse choose */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (allocated empty here, unlike the sized
     allocation in MatSeqAIJCUSPARSECopyToGPU) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU using explicitly transposed triangular factors
   (built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).  Applies the row
   permutation to b, solves U^T then L^T, then applies the column permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY        *xGPU, *bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY      *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b with the row permutation: xGPU = b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (the transposed upper factor): xGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L (the transposed lower factor): tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   As MatSolveTranspose_SeqAIJCUSPARSE but for identity orderings: no permutation
   copies are needed, so b feeds the U^T solve directly and the L^T solve writes x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY        *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY      *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve U: bGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU with the cached L/U factors: permute b by the row
   ordering, solve L then U, then permute by the column ordering.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY        *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
/* --- continuation of MatSolve_SeqAIJCUSPARSE locals --- */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY    *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;
  VecType        t;
  PetscBool      flg;

  PetscFunctionBegin;
  /* guard: both vectors must live on the GPU (VECSEQCUSP) */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP);

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: xGPU = b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L: xGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Natural-ordering variant of MatSolve_SeqAIJCUSPARSE: no permutations, so b
   feeds the L solve directly and the U solve writes x.  (Unlike the permuted
   variant, this one does not type-check its vector arguments.)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY        *xGPU,*bGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY    *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, solve L: bGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Next, solve U: tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the AIJ matrix to the GPU in the configured storage format (CSR, ELL,
   or HYB), building a compressed-row representation (only nonzero rows) if the
   host matrix does not already have one.  No-op when the GPU copy is current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
  PetscInt         m = A->rmap->n,*ii,*ridx;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* throw away any stale GPU copy before rebuilding */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
/* --- continuation of MatSeqAIJCUSPARSECopyToGPU: compressed-row setup and format-specific upload --- */
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets (ii) and the
           original row index of each kept row (ridx) for the nonzero rows only */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha/beta scalars, matching CUSPARSE_POINTER_MODE_DEVICE below */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* stage a temporary CSR copy, then convert it to HYB/ELL on the device */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR staging copy is no longer needed */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): with CUDA_VERSION<4020 this branch silently leaves
           matstruct->mat unset — presumably unreachable on supported CUDA
           versions, but confirm */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were PetscMalloc'd only in the forced-compressed-row path above */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
/* --- continuation of MatSeqAIJCUSPARSECopyToGPU: finish and mark the GPU copy valid --- */
      /* work vector sized to the number of (compressed) rows, used by MatMultAdd */
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates GPU (VECSEQCUSP) work vectors compatible with the matrix: `right`
   sized/shaped like the column space, `left` like the row space.  Either output
   pointer may be NULL to skip that vector.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix's column layout rather than building a new one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise `second += first` on a zipped pair of iterators;
   used to scatter-accumulate the compressed-row SpMV result in MatMultAdd */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   y = A*x on the GPU via cusparse SpMV, dispatching on the storage format
   (CSR vs. ELL/HYB).  Assumes the GPU copy is current (done in MatAssemblyEnd).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY        *xarray,*yarray;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with a user-supplied stream the caller owns synchronization; otherwise block here */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   y = A^T*x on the GPU: lazily builds an explicit transpose of A (so a
   non-transpose SpMV can be used), then dispatches on the storage format.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
/* --- continuation of MatMultTranspose_SeqAIJCUSPARSE locals and body --- */
  CUSPARRAY        *xarray,*yarray;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with a user-supplied stream the caller owns synchronization; otherwise block here */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   z = y + A*x on the GPU: copies y into z, computes A*x into the compressed-row
   work vector, then scatter-adds it into z via the stored compressed row indices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY        *xarray,*yarray,*zarray;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* z starts as a copy of y; the product is accumulated into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired but only restored, never otherwise used */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(),
thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1524 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1525 VecCUSPPlusEquals()); 1526 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1527 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1528 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1529 1530 } catch(char *ex) { 1531 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1532 } 1533 ierr = WaitForGPU();CHKERRCUSP(ierr); 1534 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 #undef __FUNCT__ 1539 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1540 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1541 { 1542 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1543 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1544 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1545 CUSPARRAY *xarray,*yarray,*zarray; 1546 PetscErrorCode ierr; 1547 cusparseStatus_t stat; 1548 1549 PetscFunctionBegin; 1550 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1551 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1552 if (!matstructT) { 1553 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1554 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1555 } 1556 1557 try { 1558 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1559 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1560 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1561 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1562 1563 /* multiply add with matrix transpose */ 1564 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1565 CsrMatrix *mat 
= (CsrMatrix*)matstructT->mat; 1566 /* here we need to be careful to set the number of rows in the multiply to the 1567 number of compressed rows in the matrix ... which is equivalent to the 1568 size of the workVector */ 1569 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1570 mat->num_rows, mat->num_cols, 1571 mat->num_entries, matstructT->alpha, matstructT->descr, 1572 mat->values->data().get(), mat->row_offsets->data().get(), 1573 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1574 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1575 } else { 1576 #if CUDA_VERSION>=4020 1577 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1578 if (cusparsestruct->workVector->size()) { 1579 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1580 matstructT->alpha, matstructT->descr, hybMat, 1581 xarray->data().get(), matstructT->beta, 1582 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1583 } 1584 #endif 1585 } 1586 1587 /* scatter the data from the temporary into the full vector with a += operation */ 1588 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1589 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1590 VecCUSPPlusEquals()); 1591 1592 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1593 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1594 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1595 1596 } catch(char *ex) { 1597 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1598 } 1599 ierr = WaitForGPU();CHKERRCUSP(ierr); 1600 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1601 PetscFunctionReturn(0); 
1602 } 1603 1604 #undef __FUNCT__ 1605 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1606 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1607 { 1608 PetscErrorCode ierr; 1609 1610 PetscFunctionBegin; 1611 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1612 if (A->factortype==MAT_FACTOR_NONE) { 1613 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1614 } 1615 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1616 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1617 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1618 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1619 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1620 PetscFunctionReturn(0); 1621 } 1622 1623 /* --------------------------------------------------------------------------------*/ 1624 #undef __FUNCT__ 1625 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1626 /*@ 1627 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1628 (the default parallel PETSc format). This matrix will ultimately pushed down 1629 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1630 assembly performance the user should preallocate the matrix storage by setting 1631 the parameter nz (or the array nnz). By setting these parameters accurately, 1632 performance during matrix assembly can be increased by more than a factor of 50. 1633 1634 Collective on MPI_Comm 1635 1636 Input Parameters: 1637 + comm - MPI communicator, set to PETSC_COMM_SELF 1638 . m - number of rows 1639 . n - number of columns 1640 . nz - number of nonzeros per row (same for all rows) 1641 - nnz - array containing the number of nonzeros in the various rows 1642 (possibly different for each row) or NULL 1643 1644 Output Parameter: 1645 . A - the matrix 1646 1647 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1648 MatXXXXSetPreallocation() paradgm instead of this routine directly. 
1649 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1650 1651 Notes: 1652 If nnz is given then nz is ignored 1653 1654 The AIJ format (also called the Yale sparse matrix format or 1655 compressed row storage), is fully compatible with standard Fortran 77 1656 storage. That is, the stored row and column indices can begin at 1657 either one (as in Fortran) or zero. See the users' manual for details. 1658 1659 Specify the preallocated storage with either nz or nnz (not both). 1660 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1661 allocation. For large problems you MUST preallocate memory or you 1662 will get TERRIBLE performance, see the users' manual chapter on matrices. 1663 1664 By default, this format uses inodes (identical nodes) when possible, to 1665 improve numerical efficiency of matrix-vector products and solves. We 1666 search for consecutive rows with the same nonzero structure, thereby 1667 reusing matrix information to achieve increased efficiency. 
1668 1669 Level: intermediate 1670 1671 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1672 @*/ 1673 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1674 { 1675 PetscErrorCode ierr; 1676 1677 PetscFunctionBegin; 1678 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1679 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1680 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1681 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1682 PetscFunctionReturn(0); 1683 } 1684 1685 #undef __FUNCT__ 1686 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1687 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1688 { 1689 PetscErrorCode ierr; 1690 1691 PetscFunctionBegin; 1692 if (A->factortype==MAT_FACTOR_NONE) { 1693 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1694 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1695 } 1696 } else { 1697 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1698 } 1699 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 #undef __FUNCT__ 1704 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1705 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1706 { 1707 PetscErrorCode ierr; 1708 cusparseStatus_t stat; 1709 cusparseHandle_t handle=0; 1710 1711 PetscFunctionBegin; 1712 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1713 if (B->factortype==MAT_FACTOR_NONE) { 1714 /* you cannot check the inode.use flag here since the matrix was just created. 
1715 now build a GPU matrix data structure */ 1716 B->spptr = new Mat_SeqAIJCUSPARSE; 1717 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1718 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1719 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1720 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1721 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1722 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1723 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1725 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1726 } else { 1727 /* NEXT, set the pointers to the triangular factors */ 1728 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1729 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1731 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1732 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1733 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1734 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1736 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1737 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1739 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1740 } 1741 1742 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1743 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1744 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1745 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1746 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1747 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1748 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1749 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1750 1751 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1752 1753 B->valid_GPU_matrix = 
PETSC_CUSP_UNALLOCATED; 1754 1755 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1756 PetscFunctionReturn(0); 1757 } 1758 1759 /*M 1760 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1761 1762 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1763 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1764 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1765 1766 Options Database Keys: 1767 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1768 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1769 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1770 1771 Level: beginner 1772 1773 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1774 M*/ 1775 1776 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1777 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1781 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1782 { 1783 PetscErrorCode ierr; 1784 1785 PetscFunctionBegin; 1786 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1787 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1788 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1789 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1790 PetscFunctionReturn(0); 1791 } 1792 1793 1794 #undef __FUNCT__ 1795 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1796 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1797 { 1798 cusparseStatus_t stat; 1799 cusparseHandle_t handle; 1800 1801 PetscFunctionBegin; 1802 if (*cusparsestruct) { 1803 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1804 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1805 delete (*cusparsestruct)->workVector; 1806 if (handle = (*cusparsestruct)->handle) { 1807 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1808 } 1809 delete *cusparsestruct; 1810 *cusparsestruct = 0; 1811 } 1812 PetscFunctionReturn(0); 1813 } 1814 1815 #undef __FUNCT__ 1816 #define __FUNCT__ "CsrMatrix_Destroy" 1817 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix 
**mat) 1818 { 1819 PetscFunctionBegin; 1820 if (*mat) { 1821 delete (*mat)->values; 1822 delete (*mat)->column_indices; 1823 delete (*mat)->row_offsets; 1824 delete *mat; 1825 *mat = 0; 1826 } 1827 PetscFunctionReturn(0); 1828 } 1829 1830 #undef __FUNCT__ 1831 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1832 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1833 { 1834 cusparseStatus_t stat; 1835 PetscErrorCode ierr; 1836 1837 PetscFunctionBegin; 1838 if (*trifactor) { 1839 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); } 1840 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); } 1841 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1842 delete *trifactor; 1843 *trifactor = 0; 1844 } 1845 PetscFunctionReturn(0); 1846 } 1847 1848 #undef __FUNCT__ 1849 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1850 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1851 { 1852 CsrMatrix *mat; 1853 cusparseStatus_t stat; 1854 cudaError_t err; 1855 1856 PetscFunctionBegin; 1857 if (*matstruct) { 1858 if ((*matstruct)->mat) { 1859 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1860 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1861 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1862 } else { 1863 mat = (CsrMatrix*)(*matstruct)->mat; 1864 CsrMatrix_Destroy(&mat); 1865 } 1866 } 1867 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); } 1868 delete (*matstruct)->cprowIndices; 1869 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); } 1870 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); } 1871 delete *matstruct; 1872 *matstruct = 0; 1873 } 1874 
PetscFunctionReturn(0); 1875 } 1876 1877 #undef __FUNCT__ 1878 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1879 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1880 { 1881 cusparseHandle_t handle; 1882 cusparseStatus_t stat; 1883 1884 PetscFunctionBegin; 1885 if (*trifactors) { 1886 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1887 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1888 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1889 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1890 delete (*trifactors)->rpermIndices; 1891 delete (*trifactors)->cpermIndices; 1892 delete (*trifactors)->workVector; 1893 if (handle = (*trifactors)->handle) { 1894 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1895 } 1896 delete *trifactors; 1897 *trifactors = 0; 1898 } 1899 PetscFunctionReturn(0); 1900 } 1901 1902