1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 6 #include <petscconf.h> 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/impls/sbaij/seq/sbaij.h> 9 #include <../src/vec/vec/impls/dvecimpl.h> 10 #include <petsc/private/vecimpl.h> 11 #undef VecType 12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13 14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 15 16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19 20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23 24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat); 29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 33 34 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 35 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 36 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 37 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 38 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 39 40 #undef __FUNCT__ 41 #define __FUNCT__ "MatCUSPARSESetStream" 42 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 43 { 44 cusparseStatus_t stat; 45 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 46 47 PetscFunctionBegin; 48 cusparsestruct->stream = stream; 49 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 50 PetscFunctionReturn(0); 51 } 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatCUSPARSESetHandle" 55 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 56 { 57 cusparseStatus_t stat; 58 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 59 60 PetscFunctionBegin; 61 if (cusparsestruct->handle) 62 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 63 cusparsestruct->handle = handle; 64 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 65 PetscFunctionReturn(0); 66 } 67 68 #undef __FUNCT__ 69 #define __FUNCT__ "MatCUSPARSEClearHandle" 70 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 71 { 72 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 73 PetscFunctionBegin; 74 if (cusparsestruct->handle) 75 cusparsestruct->handle = 0; 76 PetscFunctionReturn(0); 77 } 78 79 #undef __FUNCT__ 80 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 81 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 82 { 83 PetscFunctionBegin; 84 *type = MATSOLVERCUSPARSE; 85 PetscFunctionReturn(0); 86 } 87 88 /*MC 89 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 90 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 91 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 92 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 93 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 94 algorithms are not recommended. This class does NOT support direct solver operations. 95 96 Level: beginner 97 98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 99 M*/ 100 101 #undef __FUNCT__ 102 #define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse" 103 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 104 { 105 PetscErrorCode ierr; 106 PetscInt n = A->rmap->n; 107 108 PetscFunctionBegin; 109 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110 (*B)->factortype = ftype; 111 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 112 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 113 114 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 115 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 116 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 117 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 121 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122 123 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 124 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 125 PetscFunctionReturn(0); 126 } 127 128 #undef __FUNCT__ 129 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 130 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131 { 132 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 133 134 PetscFunctionBegin; 135 #if CUDA_VERSION>=4020 136 switch (op) { 137 case MAT_CUSPARSE_MULT: 138 cusparsestruct->format = format; 139 break; 140 case MAT_CUSPARSE_ALL: 141 cusparsestruct->format = format; 142 break; 143 default: 144 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 145 } 146 #else 147 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 148 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 149 #endif 150 PetscFunctionReturn(0); 151 } 152 153 /*@ 154 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 155 operation. Only the MatMult operation can use different GPU storage formats 156 for MPIAIJCUSPARSE matrices. 157 Not Collective 158 159 Input Parameters: 160 + A - Matrix of type SEQAIJCUSPARSE 161 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 162 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 163 164 Output Parameter: 165 166 Level: intermediate 167 168 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 169 @*/ 170 #undef __FUNCT__ 171 #define __FUNCT__ "MatCUSPARSESetFormat" 172 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 173 { 174 PetscErrorCode ierr; 175 176 PetscFunctionBegin; 177 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 178 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 179 PetscFunctionReturn(0); 180 } 181 182 #undef __FUNCT__ 183 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 184 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A) 185 { 186 PetscErrorCode ierr; 187 MatCUSPARSEStorageFormat format; 188 PetscBool flg; 189 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 190 191 PetscFunctionBegin; 192 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 193 ierr = PetscObjectOptionsBegin((PetscObject)A); 194 if (A->factortype==MAT_FACTOR_NONE) { 195 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 196 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 197 if (flg) { 198 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 199 } 200 } 201 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 202 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 203 if (flg) { 204 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 205 } 206 ierr = PetscOptionsEnd();CHKERRQ(ierr); 207 PetscFunctionReturn(0); 208 209 } 210 211 #undef __FUNCT__ 212 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 213 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 214 { 215 PetscErrorCode ierr; 216 217 PetscFunctionBegin; 218 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 219 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 220 PetscFunctionReturn(0); 221 } 222 223 #undef __FUNCT__ 224 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 225 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 226 { 227 PetscErrorCode ierr; 228 229 PetscFunctionBegin; 230 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 231 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 237 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 238 { 239 PetscErrorCode ierr; 240 241 PetscFunctionBegin; 242 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 243 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 244 PetscFunctionReturn(0); 245 } 246 247 #undef __FUNCT__ 248 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 249 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 250 { 251 PetscErrorCode ierr; 252 253 PetscFunctionBegin; 254 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 255 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 256 PetscFunctionReturn(0); 257 } 258 259 #undef __FUNCT__ 260 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 261 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 262 { 263 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 264 PetscInt n = A->rmap->n; 265 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 266 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 267 cusparseStatus_t stat; 268 const PetscInt *ai = a->i,*aj = a->j,*vi; 269 const MatScalar *aa = a->a,*v; 270 PetscInt *AiLo, *AjLo; 271 PetscScalar *AALo; 272 PetscInt i,nz, nzLower, offset, rowOffset; 273 PetscErrorCode ierr; 274 275 PetscFunctionBegin; 276 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 277 try { 278 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 279 nzLower=n+ai[n]-ai[1]; 280 281 /* Allocate Space for the lower triangular matrix */ 282 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 283 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 284 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 285 286 /* Fill the lower triangular matrix */ 287 AiLo[0] = (PetscInt) 0; 288 AiLo[n] = nzLower; 289 AjLo[0] = (PetscInt) 0; 290 AALo[0] = (MatScalar) 1.0; 291 v = aa; 292 vi = aj; 293 offset = 1; 294 rowOffset= 1; 295 for (i=1; i<n; i++) { 296 nz = ai[i+1] - ai[i]; 297 /* additional 1 for the term on the diagonal */ 298 AiLo[i] = rowOffset; 299 rowOffset += nz+1; 300 301 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 302 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 303 304 offset += nz; 305 AjLo[offset] = (PetscInt) i; 306 AALo[offset] = (MatScalar) 1.0; 307 offset += 1; 308 309 v += nz; 310 vi += nz; 311 } 312 313 /* allocate space for the triangular factor information */ 314 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 315 316 /* Create the matrix description */ 317 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 318 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 319 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 320 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 321 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 322 323 /* Create the solve analysis information */ 324 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 325 326 /* set the operation */ 327 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 328 329 /* set the matrix */ 330 loTriFactor->csrMat = new CsrMatrix; 331 loTriFactor->csrMat->num_rows = n; 332 loTriFactor->csrMat->num_cols = n; 333 loTriFactor->csrMat->num_entries = nzLower; 334 335 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 336 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 337 338 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 339 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 340 341 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 342 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 343 344 /* perform the solve analysis */ 345 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 346 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 347 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 348 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 349 350 /* assign the pointer. Is this really necessary? */ 351 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 352 353 ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 354 ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 355 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 356 } catch(char *ex) { 357 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 358 } 359 } 360 PetscFunctionReturn(0); 361 } 362 363 #undef __FUNCT__ 364 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 365 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 366 { 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 368 PetscInt n = A->rmap->n; 369 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 370 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 371 cusparseStatus_t stat; 372 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 373 const MatScalar *aa = a->a,*v; 374 PetscInt *AiUp, *AjUp; 375 PetscScalar *AAUp; 376 PetscInt i,nz, nzUpper, offset; 377 PetscErrorCode ierr; 378 379 PetscFunctionBegin; 380 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 381 try { 382 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 383 nzUpper = adiag[0]-adiag[n]; 384 385 /* Allocate Space for the upper triangular matrix */ 386 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 387 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 388 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 389 390 /* Fill the upper triangular matrix */ 391 AiUp[0]=(PetscInt) 0; 392 AiUp[n]=nzUpper; 393 offset = nzUpper; 394 for (i=n-1; i>=0; i--) { 395 v = aa + adiag[i+1] + 1; 396 vi = aj + adiag[i+1] + 1; 397 398 /* number of elements NOT on the diagonal */ 399 nz = adiag[i] - adiag[i+1]-1; 400 401 /* decrement the offset */ 402 offset -= (nz+1); 403 404 /* first, set the diagonal elements */ 405 AjUp[offset] = (PetscInt) i; 406 AAUp[offset] = 1./v[nz]; 407 AiUp[i] = AiUp[i+1] - (nz+1); 408 409 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 410 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 411 } 412 413 /* allocate space for the triangular factor information */ 414 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 415 416 /* Create the matrix description */ 417 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 418 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 419 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 420 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 421 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 422 423 /* Create the solve analysis information */ 424 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 425 426 /* set the operation */ 427 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 428 429 /* set the matrix */ 430 upTriFactor->csrMat = new CsrMatrix; 431 upTriFactor->csrMat->num_rows = n; 432 upTriFactor->csrMat->num_cols = n; 433 upTriFactor->csrMat->num_entries = nzUpper; 434 435 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 436 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 437 438 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 439 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 440 441 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 442 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 443 444 /* perform the solve analysis */ 445 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 446 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 447 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 448 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 449 450 /* assign the pointer. Is this really necessary? */ 451 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 452 453 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 454 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 455 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 456 } catch(char *ex) { 457 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 458 } 459 } 460 PetscFunctionReturn(0); 461 } 462 463 #undef __FUNCT__ 464 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 465 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 466 { 467 PetscErrorCode ierr; 468 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 469 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 470 IS isrow = a->row,iscol = a->icol; 471 PetscBool row_identity,col_identity; 472 const PetscInt *r,*c; 473 PetscInt n = A->rmap->n; 474 475 PetscFunctionBegin; 476 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 477 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 478 479 cusparseTriFactors->workVector = new THRUSTARRAY; 480 cusparseTriFactors->workVector->resize(n); 481 cusparseTriFactors->nnz=a->nz; 482 483 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 484 /*lower triangular indices */ 485 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 486 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 487 if (!row_identity) { 488 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 489 cusparseTriFactors->rpermIndices->assign(r, r+n); 490 } 491 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 492 493 /*upper triangular indices */ 494 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 495 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 496 if (!col_identity) { 497 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 498 cusparseTriFactors->cpermIndices->assign(c, c+n); 499 } 500 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 501 PetscFunctionReturn(0); 502 } 503 504 #undef __FUNCT__ 505 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 506 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 507 { 508 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 509 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 510 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 511 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 512 cusparseStatus_t stat; 513 PetscErrorCode ierr; 514 PetscInt *AiUp, *AjUp; 515 PetscScalar *AAUp; 516 PetscScalar *AALo; 517 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 518 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 519 const PetscInt *ai = b->i,*aj = b->j,*vj; 520 const MatScalar *aa = b->a,*v; 521 522 PetscFunctionBegin; 523 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 524 try { 525 /* Allocate Space for the upper triangular matrix */ 526 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 527 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 528 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 529 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 530 531 /* Fill the upper triangular matrix */ 532 AiUp[0]=(PetscInt) 0; 533 AiUp[n]=nzUpper; 534 offset = 0; 535 for (i=0; i<n; i++) { 536 /* set the pointers */ 537 v = aa + ai[i]; 538 vj = aj + ai[i]; 539 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 540 541 /* first, set the diagonal elements */ 542 AjUp[offset] = (PetscInt) i; 543 AAUp[offset] = 1.0/v[nz]; 544 AiUp[i] = offset; 545 AALo[offset] = 1.0/v[nz]; 546 547 offset+=1; 548 if (nz>0) { 549 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 550 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 551 for (j=offset; j<offset+nz; j++) { 552 AAUp[j] = -AAUp[j]; 553 AALo[j] = AAUp[j]/v[nz]; 554 } 555 offset+=nz; 556 } 557 } 558 559 /* allocate space for the triangular factor information */ 560 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 561 562 /* Create the matrix description */ 563 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 564 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 565 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 566 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 567 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 568 569 /* Create the solve analysis information */ 570 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 571 572 /* set the operation */ 573 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 574 575 /* set the matrix */ 576 upTriFactor->csrMat = new CsrMatrix; 577 upTriFactor->csrMat->num_rows = A->rmap->n; 578 upTriFactor->csrMat->num_cols = A->cmap->n; 579 upTriFactor->csrMat->num_entries = a->nz; 580 581 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 582 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 583 584 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 585 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 586 587 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 588 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 589 590 /* perform the solve analysis */ 591 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 592 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 593 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 594 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 595 596 /* assign the pointer. Is this really necessary? */ 597 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 598 599 /* allocate space for the triangular factor information */ 600 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 601 602 /* Create the matrix description */ 603 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 604 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 605 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 606 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 607 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 608 609 /* Create the solve analysis information */ 610 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 611 612 /* set the operation */ 613 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 614 615 /* set the matrix */ 616 loTriFactor->csrMat = new CsrMatrix; 617 loTriFactor->csrMat->num_rows = A->rmap->n; 618 loTriFactor->csrMat->num_cols = A->cmap->n; 619 loTriFactor->csrMat->num_entries = a->nz; 620 621 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 622 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 623 624 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 625 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 626 627 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 628 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 629 630 /* perform the solve analysis */ 631 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 632 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 633 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 634 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 635 636 /* assign the pointer. Is this really necessary? */ 637 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 638 639 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 640 ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 641 ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 642 ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 643 ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 644 } catch(char *ex) { 645 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 646 } 647 } 648 PetscFunctionReturn(0); 649 } 650 651 #undef __FUNCT__ 652 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 653 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 654 { 655 PetscErrorCode ierr; 656 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 657 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 658 IS ip = a->row; 659 const PetscInt *rip; 660 PetscBool perm_identity; 661 PetscInt n = A->rmap->n; 662 663 PetscFunctionBegin; 664 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 665 cusparseTriFactors->workVector = new THRUSTARRAY; 666 cusparseTriFactors->workVector->resize(n); 667 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 668 669 /*lower triangular indices */ 670 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 671 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 672 if (!perm_identity) { 673 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 674 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 675 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 676 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 677 } 678 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 679 PetscFunctionReturn(0); 680 } 681 682 #undef __FUNCT__ 683 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 684 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 685 { 686 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 687 IS isrow = b->row,iscol = b->col; 688 PetscBool row_identity,col_identity; 689 PetscErrorCode ierr; 690 691 PetscFunctionBegin; 692 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 693 /* determine which version of MatSolve needs to be used. */ 694 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 695 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 696 if (row_identity && col_identity) { 697 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 698 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 699 } else { 700 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 701 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 702 } 703 704 /* get the triangular factors */ 705 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 706 PetscFunctionReturn(0); 707 } 708 709 #undef __FUNCT__ 710 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 711 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 712 { 713 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 714 IS ip = b->row; 715 PetscBool perm_identity; 716 PetscErrorCode ierr; 717 718 PetscFunctionBegin; 719 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 720 721 /* determine which version of MatSolve needs to be used. */ 722 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 723 if (perm_identity) { 724 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 725 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 726 } else { 727 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 728 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 729 } 730 731 /* get the triangular factors */ 732 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 733 PetscFunctionReturn(0); 734 } 735 736 #undef __FUNCT__ 737 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 738 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 739 { 740 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 741 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 742 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 743 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 744 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 745 cusparseStatus_t stat; 746 cusparseIndexBase_t indexBase; 747 cusparseMatrixType_t matrixType; 748 cusparseFillMode_t fillMode; 749 cusparseDiagType_t diagType; 750 751 PetscFunctionBegin; 752 753 /*********************************************/ 754 /* Now the Transpose of the Lower Tri Factor */ 755 /*********************************************/ 756 757 /* allocate space for the transpose of the lower triangular factor */ 758 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 759 760 /* set the matrix descriptors of the lower triangular factor */ 761 matrixType = cusparseGetMatType(loTriFactor->descr); 762 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 763 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 764 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 765 diagType = cusparseGetMatDiagType(loTriFactor->descr); 766 767 /* Create the matrix description */ 768 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 769 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 770 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 771 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 772 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 773 774 /* Create the solve analysis information */ 775 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 776 777 /* set the operation */ 778 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 779 780 /* allocate GPU space for the CSC of the lower triangular factor*/ 781 loTriFactorT->csrMat = new CsrMatrix; 782 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 783 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 784 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 785 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 786 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 787 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 788 789 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 790 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 791 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 792 loTriFactor->csrMat->values->data().get(), 793 loTriFactor->csrMat->row_offsets->data().get(), 794 loTriFactor->csrMat->column_indices->data().get(), 795 loTriFactorT->csrMat->values->data().get(), 796 loTriFactorT->csrMat->column_indices->data().get(), 797 loTriFactorT->csrMat->row_offsets->data().get(), 798 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 799 800 /* perform the solve analysis on the transposed matrix */ 801 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 802 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 803 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 804 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 805 loTriFactorT->solveInfo);CHKERRCUSP(stat); 806 807 /* assign the pointer. Is this really necessary? */ 808 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 809 810 /*********************************************/ 811 /* Now the Transpose of the Upper Tri Factor */ 812 /*********************************************/ 813 814 /* allocate space for the transpose of the upper triangular factor */ 815 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 816 817 /* set the matrix descriptors of the upper triangular factor */ 818 matrixType = cusparseGetMatType(upTriFactor->descr); 819 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 820 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 821 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 822 diagType = cusparseGetMatDiagType(upTriFactor->descr); 823 824 /* Create the matrix description */ 825 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 826 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 827 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 828 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 829 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 830 831 /* Create the solve analysis information */ 832 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 833 834 /* set the operation */ 835 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 836 837 /* allocate GPU space for the CSC of the upper triangular factor*/ 838 upTriFactorT->csrMat = new CsrMatrix; 839 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 840 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 841 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 842 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 843 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 844 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 845 846 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 847 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 848 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 849 upTriFactor->csrMat->values->data().get(), 850 upTriFactor->csrMat->row_offsets->data().get(), 851 upTriFactor->csrMat->column_indices->data().get(), 852 upTriFactorT->csrMat->values->data().get(), 853 upTriFactorT->csrMat->column_indices->data().get(), 854 upTriFactorT->csrMat->row_offsets->data().get(), 855 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 856 857 /* perform the solve analysis on the transposed matrix */ 858 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 859 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 860 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 861 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 862 upTriFactorT->solveInfo);CHKERRCUSP(stat); 863 864 /* assign the pointer. Is this really necessary? */ 865 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 866 PetscFunctionReturn(0); 867 } 868 869 #undef __FUNCT__ 870 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 871 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 872 { 873 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 874 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 875 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 876 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 877 cusparseStatus_t stat; 878 cusparseIndexBase_t indexBase; 879 cudaError_t err; 880 881 PetscFunctionBegin; 882 883 /* allocate space for the triangular factor information */ 884 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 885 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 886 indexBase = cusparseGetMatIndexBase(matstruct->descr); 887 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 888 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 889 890 /* set alpha and beta */ 891 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 892 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 893 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 894 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 895 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 896 897 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 898 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 899 CsrMatrix *matrixT= new CsrMatrix; 900 matrixT->num_rows = A->rmap->n; 901 matrixT->num_cols = A->cmap->n; 902 matrixT->num_entries = a->nz; 903 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 904 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 905 matrixT->values = new THRUSTARRAY(a->nz); 906 907 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 908 indexBase = cusparseGetMatIndexBase(matstruct->descr); 909 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 910 matrix->num_cols, matrix->num_entries, 911 matrix->values->data().get(), 912 matrix->row_offsets->data().get(), 913 matrix->column_indices->data().get(), 914 matrixT->values->data().get(), 915 matrixT->column_indices->data().get(), 916 matrixT->row_offsets->data().get(), 917 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 918 919 /* assign the pointer */ 920 matstructT->mat = matrixT; 921 922 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 923 #if CUDA_VERSION>=5000 924 /* First convert HYB to CSR */ 925 CsrMatrix *temp= new CsrMatrix; 926 temp->num_rows = A->rmap->n; 927 temp->num_cols = A->cmap->n; 928 temp->num_entries = a->nz; 929 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 930 temp->column_indices = new THRUSTINTARRAY32(a->nz); 931 temp->values = new THRUSTARRAY(a->nz); 932 933 934 stat = cusparse_hyb2csr(cusparsestruct->handle, 935 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 936 temp->values->data().get(), 937 temp->row_offsets->data().get(), 938 temp->column_indices->data().get());CHKERRCUSP(stat); 939 940 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 941 CsrMatrix *tempT= new CsrMatrix; 942 tempT->num_rows = A->rmap->n; 943 tempT->num_cols = A->cmap->n; 944 tempT->num_entries = a->nz; 945 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 946 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 947 tempT->values = new THRUSTARRAY(a->nz); 948 949 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 950 temp->num_cols, temp->num_entries, 951 temp->values->data().get(), 952 temp->row_offsets->data().get(), 953 temp->column_indices->data().get(), 954 tempT->values->data().get(), 955 tempT->column_indices->data().get(), 956 tempT->row_offsets->data().get(), 957 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 958 959 /* Last, convert CSC to HYB */ 960 cusparseHybMat_t hybMat; 961 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 962 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 963 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 964 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 965 matstructT->descr, tempT->values->data().get(), 966 tempT->row_offsets->data().get(), 967 tempT->column_indices->data().get(), 968 hybMat, 0, partition);CHKERRCUSP(stat); 969 970 /* assign the pointer */ 971 matstructT->mat = hybMat; 972 973 /* delete temporaries */ 974 if (tempT) { 975 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 976 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 977 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 978 delete (CsrMatrix*) tempT; 979 } 980 if (temp) { 981 if (temp->values) delete (THRUSTARRAY*) temp->values; 982 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 983 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 984 delete (CsrMatrix*) temp; 985 } 986 #else 987 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 988 #endif 989 } 990 /* assign the compressed row indices */ 991 matstructT->cprowIndices = new THRUSTINTARRAY; 992 993 /* assign the pointer */ 994 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 995 PetscFunctionReturn(0); 996 } 997 998 #undef __FUNCT__ 999 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 1000 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1001 { 1002 CUSPARRAY *xGPU, *bGPU; 1003 cusparseStatus_t stat; 1004 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1005 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1006 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1007 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1008 PetscErrorCode ierr; 1009 1010 PetscFunctionBegin; 1011 /* Analyze the matrix and create the transpose ... on the fly */ 1012 if (!loTriFactorT && !upTriFactorT) { 1013 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1014 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1015 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1016 } 1017 1018 /* Get the GPU pointers */ 1019 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1020 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1021 1022 /* First, reorder with the row permutation */ 1023 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1024 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1025 xGPU->begin()); 1026 1027 /* First, solve U */ 1028 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1029 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1030 upTriFactorT->csrMat->values->data().get(), 1031 upTriFactorT->csrMat->row_offsets->data().get(), 1032 upTriFactorT->csrMat->column_indices->data().get(), 1033 upTriFactorT->solveInfo, 1034 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1035 1036 /* Then, solve L */ 1037 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1038 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1039 loTriFactorT->csrMat->values->data().get(), 1040 loTriFactorT->csrMat->row_offsets->data().get(), 1041 loTriFactorT->csrMat->column_indices->data().get(), 1042 loTriFactorT->solveInfo, 1043 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1044 1045 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1046 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1047 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1048 tempGPU->begin()); 1049 1050 /* Copy the temporary to the full solution. */ 1051 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1052 1053 /* restore */ 1054 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1055 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1056 ierr = WaitForGPU();CHKERRCUSP(ierr); 1057 1058 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1059 PetscFunctionReturn(0); 1060 } 1061 1062 #undef __FUNCT__ 1063 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 1064 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1065 { 1066 CUSPARRAY *xGPU,*bGPU; 1067 cusparseStatus_t stat; 1068 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1069 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1070 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1071 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1072 PetscErrorCode ierr; 1073 1074 PetscFunctionBegin; 1075 /* Analyze the matrix and create the transpose ... on the fly */ 1076 if (!loTriFactorT && !upTriFactorT) { 1077 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1078 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1079 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1080 } 1081 1082 /* Get the GPU pointers */ 1083 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1084 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1085 1086 /* First, solve U */ 1087 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1088 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1089 upTriFactorT->csrMat->values->data().get(), 1090 upTriFactorT->csrMat->row_offsets->data().get(), 1091 upTriFactorT->csrMat->column_indices->data().get(), 1092 upTriFactorT->solveInfo, 1093 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1094 1095 /* Then, solve L */ 1096 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1097 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1098 loTriFactorT->csrMat->values->data().get(), 1099 loTriFactorT->csrMat->row_offsets->data().get(), 1100 loTriFactorT->csrMat->column_indices->data().get(), 1101 loTriFactorT->solveInfo, 1102 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1103 1104 /* restore */ 1105 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1106 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1107 ierr = WaitForGPU();CHKERRCUSP(ierr); 1108 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1109 PetscFunctionReturn(0); 1110 } 1111 1112 #undef __FUNCT__ 1113 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 1114 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1115 { 1116 CUSPARRAY *xGPU,*bGPU; 1117 cusparseStatus_t stat; 1118 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1119 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1120 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1121 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1122 PetscErrorCode ierr; 1123 VecType t; 1124 PetscBool flg; 1125 1126 PetscFunctionBegin; 1127 ierr = VecGetType(bb,&t);CHKERRQ(ierr); 1128 ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr); 1129 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP); 1130 ierr = VecGetType(xx,&t);CHKERRQ(ierr); 1131 ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr); 1132 if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP); 1133 1134 /* Get the GPU pointers */ 1135 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1136 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1137 1138 /* First, reorder with the row permutation */ 1139 thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1140 thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1141 xGPU->begin()); 1142 1143 /* Next, solve L */ 1144 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1145 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1146 loTriFactor->csrMat->values->data().get(), 1147 loTriFactor->csrMat->row_offsets->data().get(), 1148 loTriFactor->csrMat->column_indices->data().get(), 1149 loTriFactor->solveInfo, 1150 xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1151 1152 /* Then, solve U */ 1153 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1154 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1155 upTriFactor->csrMat->values->data().get(), 1156 upTriFactor->csrMat->row_offsets->data().get(), 1157 upTriFactor->csrMat->column_indices->data().get(), 1158 upTriFactor->solveInfo, 1159 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1160 1161 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1162 thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1163 thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1164 tempGPU->begin()); 1165 1166 /* Copy the temporary to the full solution. */ 1167 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1168 1169 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1170 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1171 ierr = WaitForGPU();CHKERRCUSP(ierr); 1172 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 1178 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1179 { 1180 CUSPARRAY *xGPU,*bGPU; 1181 cusparseStatus_t stat; 1182 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1183 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1184 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1185 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1186 PetscErrorCode ierr; 1187 1188 PetscFunctionBegin; 1189 /* Get the GPU pointers */ 1190 ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1191 ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1192 1193 /* First, solve L */ 1194 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1195 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1196 loTriFactor->csrMat->values->data().get(), 1197 loTriFactor->csrMat->row_offsets->data().get(), 1198 loTriFactor->csrMat->column_indices->data().get(), 1199 loTriFactor->solveInfo, 1200 bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1201 1202 /* Next, solve U */ 1203 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1204 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1205 upTriFactor->csrMat->values->data().get(), 1206 upTriFactor->csrMat->row_offsets->data().get(), 1207 upTriFactor->csrMat->column_indices->data().get(), 1208 upTriFactor->solveInfo, 1209 tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1210 1211 ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1212 ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1213 ierr = WaitForGPU();CHKERRCUSP(ierr); 1214 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1215 PetscFunctionReturn(0); 1216 } 1217 1218 #undef __FUNCT__ 1219 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 1220 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1221 { 1222 1223 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1224 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1225 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1226 PetscInt m = A->rmap->n,*ii,*ridx; 1227 PetscErrorCode ierr; 1228 cusparseStatus_t stat; 1229 cudaError_t err; 1230 1231 PetscFunctionBegin; 1232 if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 1233 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1234 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1235 try { 1236 cusparsestruct->nonzerorow=0; 1237 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1238 1239 if (a->compressedrow.use) { 1240 m = a->compressedrow.nrows; 1241 ii = a->compressedrow.i; 1242 ridx = a->compressedrow.rindex; 1243 } else { 1244 /* Forcing compressed row on the GPU */ 1245 int k=0; 1246 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1247 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1248 ii[0]=0; 1249 for (int j = 0; j<m; j++) { 1250 if ((a->i[j+1]-a->i[j])>0) { 1251 ii[k] = a->i[j]; 1252 ridx[k]= j; 1253 k++; 1254 } 1255 } 1256 ii[cusparsestruct->nonzerorow] = a->nz; 1257 m = cusparsestruct->nonzerorow; 1258 } 1259 1260 /* allocate space for the triangular factor information */ 1261 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1262 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1263 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1264 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 1265 1266 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 1267 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1268 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err); 1269 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1270 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 1271 1272 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1273 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1274 /* set the matrix */ 1275 CsrMatrix *matrix= new CsrMatrix; 1276 matrix->num_rows = m; 1277 matrix->num_cols = A->cmap->n; 1278 matrix->num_entries = a->nz; 1279 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1280 matrix->row_offsets->assign(ii, ii + m+1); 1281 1282 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1283 matrix->column_indices->assign(a->j, a->j+a->nz); 1284 1285 matrix->values = new THRUSTARRAY(a->nz); 1286 matrix->values->assign(a->a, a->a+a->nz); 1287 1288 /* assign the pointer */ 1289 matstruct->mat = matrix; 1290 1291 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1292 #if CUDA_VERSION>=4020 1293 CsrMatrix *matrix= new CsrMatrix; 1294 matrix->num_rows = m; 1295 matrix->num_cols = A->cmap->n; 1296 matrix->num_entries = a->nz; 1297 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1298 matrix->row_offsets->assign(ii, ii + m+1); 1299 1300 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1301 matrix->column_indices->assign(a->j, a->j+a->nz); 1302 1303 matrix->values = new THRUSTARRAY(a->nz); 1304 matrix->values->assign(a->a, a->a+a->nz); 1305 1306 cusparseHybMat_t hybMat; 1307 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1308 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1309 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1310 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1311 matstruct->descr, matrix->values->data().get(), 1312 matrix->row_offsets->data().get(), 1313 matrix->column_indices->data().get(), 1314 hybMat, 0, partition);CHKERRCUSP(stat); 1315 /* assign the pointer */ 1316 matstruct->mat = hybMat; 1317 1318 if (matrix) { 1319 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1320 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1321 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1322 delete (CsrMatrix*)matrix; 1323 } 1324 #endif 1325 } 1326 1327 /* assign the compressed row indices */ 1328 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1329 matstruct->cprowIndices->assign(ridx,ridx+m); 1330 1331 /* assign the pointer */ 1332 cusparsestruct->mat = matstruct; 1333 1334 if (!a->compressedrow.use) { 1335 ierr = PetscFree(ii);CHKERRQ(ierr); 1336 ierr = PetscFree(ridx);CHKERRQ(ierr); 1337 } 1338 cusparsestruct->workVector = new THRUSTARRAY; 1339 cusparsestruct->workVector->resize(m); 1340 } catch(char *ex) { 1341 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1342 } 1343 ierr = WaitForGPU();CHKERRCUSP(ierr); 1344 1345 A->valid_GPU_matrix = PETSC_CUSP_BOTH; 1346 1347 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1348 } 1349 PetscFunctionReturn(0); 1350 } 1351 1352 #undef __FUNCT__ 1353 #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE" 1354 static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 1355 { 1356 PetscErrorCode ierr; 1357 PetscInt rbs,cbs; 1358 1359 PetscFunctionBegin; 1360 ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 1361 if (right) { 1362 ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 1363 ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1364 ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 1365 ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 1366 ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 1367 } 1368 if (left) { 1369 ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 1370 ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 1371 ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 1372 ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 1373 ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 1374 } 1375 PetscFunctionReturn(0); 1376 } 1377 1378 struct VecCUSPPlusEquals 1379 { 1380 template <typename Tuple> 1381 __host__ __device__ 1382 void operator()(Tuple t) 1383 { 1384 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1385 } 1386 }; 1387 1388 #undef __FUNCT__ 1389 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 1390 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1391 { 1392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1393 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1394 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1395 CUSPARRAY *xarray,*yarray; 1396 PetscErrorCode ierr; 1397 cusparseStatus_t stat; 1398 1399 PetscFunctionBegin; 1400 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1401 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1402 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1403 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1404 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1405 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1406 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1407 mat->num_rows, mat->num_cols, mat->num_entries, 1408 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1409 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1410 yarray->data().get());CHKERRCUSP(stat); 1411 } else { 1412 #if CUDA_VERSION>=4020 1413 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1414 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1415 matstruct->alpha, matstruct->descr, hybMat, 1416 xarray->data().get(), matstruct->beta, 1417 yarray->data().get());CHKERRCUSP(stat); 1418 #endif 1419 } 1420 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1421 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1422 if (!cusparsestruct->stream) { 1423 ierr = WaitForGPU();CHKERRCUSP(ierr); 1424 } 1425 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1426 PetscFunctionReturn(0); 1427 } 1428 1429 #undef __FUNCT__ 1430 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 1431 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1432 { 1433 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1434 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1435 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1436 CUSPARRAY *xarray,*yarray; 1437 PetscErrorCode ierr; 1438 cusparseStatus_t stat; 1439 1440 PetscFunctionBegin; 1441 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1442 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1443 if (!matstructT) { 1444 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1445 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1446 } 1447 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1448 ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1449 1450 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1451 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1452 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1453 mat->num_rows, mat->num_cols, 1454 mat->num_entries, matstructT->alpha, matstructT->descr, 1455 mat->values->data().get(), mat->row_offsets->data().get(), 1456 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1457 yarray->data().get());CHKERRCUSP(stat); 1458 } else { 1459 #if CUDA_VERSION>=4020 1460 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1461 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1462 matstructT->alpha, matstructT->descr, hybMat, 1463 xarray->data().get(), matstructT->beta, 1464 yarray->data().get());CHKERRCUSP(stat); 1465 #endif 1466 } 1467 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1468 ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1469 if (!cusparsestruct->stream) { 1470 ierr = WaitForGPU();CHKERRCUSP(ierr); 1471 } 1472 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1473 PetscFunctionReturn(0); 1474 } 1475 1476 1477 #undef __FUNCT__ 1478 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 1479 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1480 { 1481 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1482 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1483 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1484 CUSPARRAY *xarray,*yarray,*zarray; 1485 PetscErrorCode ierr; 1486 cusparseStatus_t stat; 1487 1488 PetscFunctionBegin; 1489 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1490 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1491 try { 1492 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1493 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1494 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1495 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1496 1497 /* multiply add */ 1498 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1499 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1500 /* here we need to be careful to set the number of rows in the multiply to the 1501 number of compressed rows in the matrix ... which is equivalent to the 1502 size of the workVector */ 1503 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1504 mat->num_rows, mat->num_cols, 1505 mat->num_entries, matstruct->alpha, matstruct->descr, 1506 mat->values->data().get(), mat->row_offsets->data().get(), 1507 mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1508 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1509 } else { 1510 #if CUDA_VERSION>=4020 1511 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1512 if (cusparsestruct->workVector->size()) { 1513 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1514 matstruct->alpha, matstruct->descr, hybMat, 1515 xarray->data().get(), matstruct->beta, 1516 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1517 } 1518 #endif 1519 } 1520 1521 /* scatter the data from the temporary into the full vector with a += operation */ 1522 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1523 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1524 VecCUSPPlusEquals()); 1525 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1526 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1527 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1528 1529 } catch(char *ex) { 1530 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1531 } 1532 ierr = WaitForGPU();CHKERRCUSP(ierr); 1533 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 #undef __FUNCT__ 1538 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 1539 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1540 { 1541 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1542 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1543 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1544 CUSPARRAY *xarray,*yarray,*zarray; 1545 PetscErrorCode ierr; 1546 cusparseStatus_t stat; 1547 1548 PetscFunctionBegin; 1549 /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1550 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1551 if (!matstructT) { 1552 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1553 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1554 } 1555 1556 try { 1557 ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1558 ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1559 ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1560 ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1561 1562 /* multiply add with matrix transpose */ 1563 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1564 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1565 /* here we need to be careful to set the number of rows in the multiply to the 1566 number of compressed rows in the matrix ... which is equivalent to the 1567 size of the workVector */ 1568 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1569 mat->num_rows, mat->num_cols, 1570 mat->num_entries, matstructT->alpha, matstructT->descr, 1571 mat->values->data().get(), mat->row_offsets->data().get(), 1572 mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1573 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1574 } else { 1575 #if CUDA_VERSION>=4020 1576 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1577 if (cusparsestruct->workVector->size()) { 1578 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1579 matstructT->alpha, matstructT->descr, hybMat, 1580 xarray->data().get(), matstructT->beta, 1581 cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1582 } 1583 #endif 1584 } 1585 1586 /* scatter the data from the temporary into the full vector with a += operation */ 1587 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1588 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1589 VecCUSPPlusEquals()); 1590 1591 ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1592 ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1593 ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1594 1595 } catch(char *ex) { 1596 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1597 } 1598 ierr = WaitForGPU();CHKERRCUSP(ierr); 1599 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1600 PetscFunctionReturn(0); 1601 } 1602 1603 #undef __FUNCT__ 1604 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 1605 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1606 { 1607 PetscErrorCode ierr; 1608 1609 PetscFunctionBegin; 1610 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1611 if (A->factortype==MAT_FACTOR_NONE) { 1612 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1613 } 1614 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1615 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1616 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1617 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1618 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1619 PetscFunctionReturn(0); 1620 } 1621 1622 /* --------------------------------------------------------------------------------*/ 1623 #undef __FUNCT__ 1624 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1625 /*@ 1626 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1627 (the default parallel PETSc format). This matrix will ultimately pushed down 1628 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1629 assembly performance the user should preallocate the matrix storage by setting 1630 the parameter nz (or the array nnz). By setting these parameters accurately, 1631 performance during matrix assembly can be increased by more than a factor of 50. 1632 1633 Collective on MPI_Comm 1634 1635 Input Parameters: 1636 + comm - MPI communicator, set to PETSC_COMM_SELF 1637 . m - number of rows 1638 . n - number of columns 1639 . nz - number of nonzeros per row (same for all rows) 1640 - nnz - array containing the number of nonzeros in the various rows 1641 (possibly different for each row) or NULL 1642 1643 Output Parameter: 1644 . A - the matrix 1645 1646 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1647 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1648 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1649 1650 Notes: 1651 If nnz is given then nz is ignored 1652 1653 The AIJ format (also called the Yale sparse matrix format or 1654 compressed row storage), is fully compatible with standard Fortran 77 1655 storage. That is, the stored row and column indices can begin at 1656 either one (as in Fortran) or zero. See the users' manual for details. 1657 1658 Specify the preallocated storage with either nz or nnz (not both). 1659 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1660 allocation. For large problems you MUST preallocate memory or you 1661 will get TERRIBLE performance, see the users' manual chapter on matrices. 1662 1663 By default, this format uses inodes (identical nodes) when possible, to 1664 improve numerical efficiency of matrix-vector products and solves. We 1665 search for consecutive rows with the same nonzero structure, thereby 1666 reusing matrix information to achieve increased efficiency. 1667 1668 Level: intermediate 1669 1670 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1671 @*/ 1672 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1673 { 1674 PetscErrorCode ierr; 1675 1676 PetscFunctionBegin; 1677 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1678 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1679 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1680 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 #undef __FUNCT__ 1685 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 1686 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1687 { 1688 PetscErrorCode ierr; 1689 1690 PetscFunctionBegin; 1691 if (A->factortype==MAT_FACTOR_NONE) { 1692 if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1693 ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1694 } 1695 } else { 1696 ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1697 } 1698 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1699 PetscFunctionReturn(0); 1700 } 1701 1702 #undef __FUNCT__ 1703 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 1704 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1705 { 1706 PetscErrorCode ierr; 1707 cusparseStatus_t stat; 1708 cusparseHandle_t handle=0; 1709 1710 PetscFunctionBegin; 1711 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1712 if (B->factortype==MAT_FACTOR_NONE) { 1713 /* you cannot check the inode.use flag here since the matrix was just created. 1714 now build a GPU matrix data structure */ 1715 B->spptr = new Mat_SeqAIJCUSPARSE; 1716 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1717 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1718 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1719 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1720 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1721 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1722 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1723 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1725 } else { 1726 /* NEXT, set the pointers to the triangular factors */ 1727 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1728 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1729 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1731 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1732 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1733 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1734 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1736 stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1737 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1739 } 1740 1741 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1742 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1743 B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 1744 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1745 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1746 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1747 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1748 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1749 1750 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1751 1752 B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1753 1754 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 /*M 1759 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1760 1761 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1762 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1763 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1764 1765 Options Database Keys: 1766 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1767 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1768 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1769 1770 Level: beginner 1771 1772 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1773 M*/ 1774 1775 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1776 1777 1778 #undef __FUNCT__ 1779 #define __FUNCT__ "MatSolverPackageRegister_CUSPARSE" 1780 PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void) 1781 { 1782 PetscErrorCode ierr; 1783 1784 PetscFunctionBegin; 1785 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1786 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1787 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1788 ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1789 PetscFunctionReturn(0); 1790 } 1791 1792 1793 #undef __FUNCT__ 1794 #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1795 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1796 { 1797 cusparseStatus_t stat; 1798 cusparseHandle_t handle; 1799 1800 PetscFunctionBegin; 1801 if (*cusparsestruct) { 1802 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1803 Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1804 delete (*cusparsestruct)->workVector; 1805 if (handle = (*cusparsestruct)->handle) { 1806 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1807 } 1808 delete *cusparsestruct; 1809 *cusparsestruct = 0; 1810 } 1811 PetscFunctionReturn(0); 1812 } 1813 1814 #undef __FUNCT__ 1815 #define __FUNCT__ "CsrMatrix_Destroy" 1816 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1817 { 1818 PetscFunctionBegin; 1819 if (*mat) { 1820 delete (*mat)->values; 1821 delete (*mat)->column_indices; 1822 delete (*mat)->row_offsets; 1823 delete *mat; 1824 *mat = 0; 1825 } 1826 PetscFunctionReturn(0); 1827 } 1828 1829 #undef __FUNCT__ 1830 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1831 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1832 { 1833 cusparseStatus_t stat; 1834 PetscErrorCode ierr; 1835 1836 PetscFunctionBegin; 1837 if (*trifactor) { 1838 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); } 1839 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); } 1840 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1841 delete *trifactor; 1842 *trifactor = 0; 1843 } 1844 PetscFunctionReturn(0); 1845 } 1846 1847 #undef __FUNCT__ 1848 #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1849 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1850 { 1851 CsrMatrix *mat; 1852 cusparseStatus_t stat; 1853 cudaError_t err; 1854 1855 PetscFunctionBegin; 1856 if (*matstruct) { 1857 if ((*matstruct)->mat) { 1858 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1859 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1860 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1861 } else { 1862 mat = (CsrMatrix*)(*matstruct)->mat; 1863 CsrMatrix_Destroy(&mat); 1864 } 1865 } 1866 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); } 1867 delete (*matstruct)->cprowIndices; 1868 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); } 1869 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); } 1870 delete *matstruct; 1871 *matstruct = 0; 1872 } 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #undef __FUNCT__ 1877 #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1878 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1879 { 1880 cusparseHandle_t handle; 1881 cusparseStatus_t stat; 1882 1883 PetscFunctionBegin; 1884 if (*trifactors) { 1885 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1886 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1887 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1888 Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1889 delete (*trifactors)->rpermIndices; 1890 delete (*trifactors)->cpermIndices; 1891 delete (*trifactors)->workVector; 1892 if (handle = (*trifactors)->handle) { 1893 stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1894 } 1895 delete *trifactors; 1896 *trifactors = 0; 1897 } 1898 PetscFunctionReturn(0); 1899 } 1900 1901