1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 40 41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 42 { 43 cusparseStatus_t stat; 44 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 45 46 PetscFunctionBegin; 47 cusparsestruct->stream = stream; 48 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat); 49 PetscFunctionReturn(0); 50 } 51 52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 53 { 54 cusparseStatus_t stat; 55 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 56 57 PetscFunctionBegin; 58 if (cusparsestruct->handle != handle) { 59 if (cusparsestruct->handle) { 60 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat); 61 } 62 cusparsestruct->handle = handle; 63 } 64 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 65 PetscFunctionReturn(0); 66 } 67 68 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 69 { 70 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 71 PetscFunctionBegin; 72 if (cusparsestruct->handle) 73 cusparsestruct->handle = 0; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 78 { 79 PetscFunctionBegin; 80 *type = MATSOLVERCUSPARSE; 81 PetscFunctionReturn(0); 82 } 83 84 /*MC 85 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 86 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 87 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 88 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 89 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 90 algorithms are not recommended. This class does NOT support direct solver operations. 91 92 Level: beginner 93 94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 95 M*/ 96 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 123 { 124 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 125 126 PetscFunctionBegin; 127 #if CUDA_VERSION>=4020 128 switch (op) { 129 case MAT_CUSPARSE_MULT: 130 cusparsestruct->format = format; 131 break; 132 case MAT_CUSPARSE_ALL: 133 cusparsestruct->format = format; 134 break; 135 default: 136 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 137 } 138 #else 139 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 140 #endif 141 PetscFunctionReturn(0); 142 } 143 144 /*@ 145 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 146 operation. Only the MatMult operation can use different GPU storage formats 147 for MPIAIJCUSPARSE matrices. 148 Not Collective 149 150 Input Parameters: 151 + A - Matrix of type SEQAIJCUSPARSE 152 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 153 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 154 155 Output Parameter: 156 157 Level: intermediate 158 159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 160 @*/ 161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 162 { 163 PetscErrorCode ierr; 164 165 PetscFunctionBegin; 166 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 167 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 168 PetscFunctionReturn(0); 169 } 170 171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 172 { 173 PetscErrorCode ierr; 174 MatCUSPARSEStorageFormat format; 175 PetscBool flg; 176 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 177 178 PetscFunctionBegin; 179 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 180 if (A->factortype==MAT_FACTOR_NONE) { 181 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 182 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 183 if (flg) { 184 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 185 } 186 } 187 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 188 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 189 if (flg) { 190 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 191 } 192 ierr = PetscOptionsTail();CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 195 } 196 197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 198 { 199 PetscErrorCode ierr; 200 201 PetscFunctionBegin; 202 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 203 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 204 PetscFunctionReturn(0); 205 } 206 207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 208 { 209 PetscErrorCode ierr; 210 211 PetscFunctionBegin; 212 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 213 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 214 PetscFunctionReturn(0); 215 } 216 217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 223 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 224 PetscFunctionReturn(0); 225 } 226 227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 228 { 229 PetscErrorCode ierr; 230 231 PetscFunctionBegin; 232 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 233 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 234 PetscFunctionReturn(0); 235 } 236 237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 238 { 239 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 240 PetscInt n = A->rmap->n; 241 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 242 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 243 cusparseStatus_t stat; 244 const PetscInt *ai = a->i,*aj = a->j,*vi; 245 const MatScalar *aa = a->a,*v; 246 PetscInt *AiLo, *AjLo; 247 PetscScalar *AALo; 248 PetscInt i,nz, nzLower, offset, rowOffset; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 253 try { 254 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 255 nzLower=n+ai[n]-ai[1]; 256 257 /* Allocate Space for the lower triangular matrix */ 258 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 259 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr); 260 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr); 261 262 /* Fill the lower triangular matrix */ 263 AiLo[0] = (PetscInt) 0; 264 AiLo[n] = nzLower; 265 AjLo[0] = (PetscInt) 0; 266 AALo[0] = (MatScalar) 1.0; 267 v = aa; 268 vi = aj; 269 offset = 1; 270 rowOffset= 1; 271 for (i=1; i<n; i++) { 272 nz = ai[i+1] - ai[i]; 273 /* additional 1 for the term on the diagonal */ 274 AiLo[i] = rowOffset; 275 rowOffset += nz+1; 276 277 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 278 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 279 280 offset += nz; 281 AjLo[offset] = (PetscInt) i; 282 AALo[offset] = (MatScalar) 1.0; 283 offset += 1; 284 285 v += nz; 286 vi += nz; 287 } 288 289 /* allocate space for the triangular factor information */ 290 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 291 292 /* Create the matrix description */ 293 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 294 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 295 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 296 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat); 297 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 298 299 /* Create the solve analysis information */ 300 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 301 302 /* set the operation */ 303 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 304 305 /* set the matrix */ 306 loTriFactor->csrMat = new CsrMatrix; 307 loTriFactor->csrMat->num_rows = n; 308 loTriFactor->csrMat->num_cols = n; 309 loTriFactor->csrMat->num_entries = nzLower; 310 311 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 312 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 313 314 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 315 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 316 317 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 318 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 319 320 /* perform the solve analysis */ 321 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 322 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 323 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 324 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 325 326 /* assign the pointer. Is this really necessary? */ 327 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 328 329 ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr); 330 ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr); 331 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 332 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 333 } catch(char *ex) { 334 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 335 } 336 } 337 PetscFunctionReturn(0); 338 } 339 340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 341 { 342 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 343 PetscInt n = A->rmap->n; 344 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 345 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 346 cusparseStatus_t stat; 347 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 348 const MatScalar *aa = a->a,*v; 349 PetscInt *AiUp, *AjUp; 350 PetscScalar *AAUp; 351 PetscInt i,nz, nzUpper, offset; 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 356 try { 357 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 358 nzUpper = adiag[0]-adiag[n]; 359 360 /* Allocate Space for the upper triangular matrix */ 361 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 362 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 363 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 364 365 /* Fill the upper triangular matrix */ 366 AiUp[0]=(PetscInt) 0; 367 AiUp[n]=nzUpper; 368 offset = nzUpper; 369 for (i=n-1; i>=0; i--) { 370 v = aa + adiag[i+1] + 1; 371 vi = aj + adiag[i+1] + 1; 372 373 /* number of elements NOT on the diagonal */ 374 nz = adiag[i] - adiag[i+1]-1; 375 376 /* decrement the offset */ 377 offset -= (nz+1); 378 379 /* first, set the diagonal elements */ 380 AjUp[offset] = (PetscInt) i; 381 AAUp[offset] = (MatScalar)1./v[nz]; 382 AiUp[i] = AiUp[i+1] - (nz+1); 383 384 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 385 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 386 } 387 388 /* allocate space for the triangular factor information */ 389 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 390 391 /* Create the matrix description */ 392 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 393 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 394 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 395 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 396 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 397 398 /* Create the solve analysis information */ 399 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 400 401 /* set the operation */ 402 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 403 404 /* set the matrix */ 405 upTriFactor->csrMat = new CsrMatrix; 406 upTriFactor->csrMat->num_rows = n; 407 upTriFactor->csrMat->num_cols = n; 408 upTriFactor->csrMat->num_entries = nzUpper; 409 410 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 411 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 412 413 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 414 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 415 416 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 417 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 418 419 /* perform the solve analysis */ 420 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 421 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 422 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 423 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 424 425 /* assign the pointer. Is this really necessary? */ 426 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 427 428 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 429 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 430 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 431 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 432 } catch(char *ex) { 433 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 434 } 435 } 436 PetscFunctionReturn(0); 437 } 438 439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 440 { 441 PetscErrorCode ierr; 442 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 443 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 444 IS isrow = a->row,iscol = a->icol; 445 PetscBool row_identity,col_identity; 446 const PetscInt *r,*c; 447 PetscInt n = A->rmap->n; 448 449 PetscFunctionBegin; 450 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 451 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 452 453 cusparseTriFactors->workVector = new THRUSTARRAY(n); 454 cusparseTriFactors->nnz=a->nz; 455 456 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 457 /*lower triangular indices */ 458 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 459 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 460 if (!row_identity) { 461 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 462 cusparseTriFactors->rpermIndices->assign(r, r+n); 463 } 464 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 465 466 /*upper triangular indices */ 467 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 468 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 469 if (!col_identity) { 470 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 471 cusparseTriFactors->cpermIndices->assign(c, c+n); 472 } 473 474 if(!row_identity && !col_identity) { 475 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 476 } else if(!row_identity) { 477 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 478 } else if(!col_identity) { 479 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 480 } 481 482 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 483 PetscFunctionReturn(0); 484 } 485 486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 487 { 488 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 489 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 490 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 491 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 492 cusparseStatus_t stat; 493 PetscErrorCode ierr; 494 PetscInt *AiUp, *AjUp; 495 PetscScalar *AAUp; 496 PetscScalar *AALo; 497 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 498 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 499 const PetscInt *ai = b->i,*aj = b->j,*vj; 500 const MatScalar *aa = b->a,*v; 501 502 PetscFunctionBegin; 503 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 504 try { 505 /* Allocate Space for the upper triangular matrix */ 506 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 507 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 508 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 509 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 510 511 /* Fill the upper triangular matrix */ 512 AiUp[0]=(PetscInt) 0; 513 AiUp[n]=nzUpper; 514 offset = 0; 515 for (i=0; i<n; i++) { 516 /* set the pointers */ 517 v = aa + ai[i]; 518 vj = aj + ai[i]; 519 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 520 521 /* first, set the diagonal elements */ 522 AjUp[offset] = (PetscInt) i; 523 AAUp[offset] = (MatScalar)1.0/v[nz]; 524 AiUp[i] = offset; 525 AALo[offset] = (MatScalar)1.0/v[nz]; 526 527 offset+=1; 528 if (nz>0) { 529 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 530 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 531 for (j=offset; j<offset+nz; j++) { 532 AAUp[j] = -AAUp[j]; 533 AALo[j] = AAUp[j]/v[nz]; 534 } 535 offset+=nz; 536 } 537 } 538 539 /* allocate space for the triangular factor information */ 540 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 541 542 /* Create the matrix description */ 543 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 544 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 545 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 546 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 547 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 548 549 /* Create the solve analysis information */ 550 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 551 552 /* set the operation */ 553 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 554 555 /* set the matrix */ 556 upTriFactor->csrMat = new CsrMatrix; 557 upTriFactor->csrMat->num_rows = A->rmap->n; 558 upTriFactor->csrMat->num_cols = A->cmap->n; 559 upTriFactor->csrMat->num_entries = a->nz; 560 561 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 562 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 563 564 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 565 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 566 567 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 568 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 569 570 /* perform the solve analysis */ 571 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 572 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 573 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 574 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 575 576 /* assign the pointer. Is this really necessary? */ 577 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 578 579 /* allocate space for the triangular factor information */ 580 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 581 582 /* Create the matrix description */ 583 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 584 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 585 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 586 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 587 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 588 589 /* Create the solve analysis information */ 590 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 591 592 /* set the operation */ 593 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 594 595 /* set the matrix */ 596 loTriFactor->csrMat = new CsrMatrix; 597 loTriFactor->csrMat->num_rows = A->rmap->n; 598 loTriFactor->csrMat->num_cols = A->cmap->n; 599 loTriFactor->csrMat->num_entries = a->nz; 600 601 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 602 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 603 604 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 605 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 606 607 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 608 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 609 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 610 611 /* perform the solve analysis */ 612 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 613 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 614 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 615 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 616 617 /* assign the pointer. Is this really necessary? */ 618 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 619 620 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 621 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 622 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 623 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 624 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 625 } catch(char *ex) { 626 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 627 } 628 } 629 PetscFunctionReturn(0); 630 } 631 632 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 633 { 634 PetscErrorCode ierr; 635 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 636 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 637 IS ip = a->row; 638 const PetscInt *rip; 639 PetscBool perm_identity; 640 PetscInt n = A->rmap->n; 641 642 PetscFunctionBegin; 643 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 644 cusparseTriFactors->workVector = new THRUSTARRAY(n); 645 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 646 647 /*lower triangular indices */ 648 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 649 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 650 if (!perm_identity) { 651 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 652 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 653 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 654 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 655 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 656 } 657 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 658 PetscFunctionReturn(0); 659 } 660 661 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 662 { 663 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 664 IS isrow = b->row,iscol = b->col; 665 PetscBool row_identity,col_identity; 666 PetscErrorCode ierr; 667 668 PetscFunctionBegin; 669 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 670 /* determine which version of MatSolve needs to be used. */ 671 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 672 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 673 if (row_identity && col_identity) { 674 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 675 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 676 } else { 677 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 678 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 679 } 680 681 /* get the triangular factors */ 682 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 683 PetscFunctionReturn(0); 684 } 685 686 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 687 { 688 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 689 IS ip = b->row; 690 PetscBool perm_identity; 691 PetscErrorCode ierr; 692 693 PetscFunctionBegin; 694 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 695 696 /* determine which version of MatSolve needs to be used. */ 697 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 698 if (perm_identity) { 699 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 700 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 701 } else { 702 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 703 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 704 } 705 706 /* get the triangular factors */ 707 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 708 PetscFunctionReturn(0); 709 } 710 711 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 712 { 713 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 714 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 715 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 716 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 717 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 718 cusparseStatus_t stat; 719 cusparseIndexBase_t indexBase; 720 cusparseMatrixType_t matrixType; 721 cusparseFillMode_t fillMode; 722 cusparseDiagType_t diagType; 723 724 PetscFunctionBegin; 725 726 /*********************************************/ 727 /* Now the Transpose of the Lower Tri Factor */ 728 /*********************************************/ 729 730 /* allocate space for the transpose of the lower triangular factor */ 731 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 732 733 /* set the matrix descriptors of the lower triangular factor */ 734 matrixType = cusparseGetMatType(loTriFactor->descr); 735 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 736 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 737 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 738 diagType = cusparseGetMatDiagType(loTriFactor->descr); 739 740 /* Create the matrix description */ 741 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 742 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 743 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 744 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 745 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 746 747 /* Create the solve analysis information */ 748 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 749 750 /* set the operation */ 751 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 752 753 /* allocate GPU space for the CSC of the lower triangular factor*/ 754 loTriFactorT->csrMat = new CsrMatrix; 755 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 756 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 757 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 758 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 759 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 760 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 761 762 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 763 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 764 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 765 loTriFactor->csrMat->values->data().get(), 766 loTriFactor->csrMat->row_offsets->data().get(), 767 loTriFactor->csrMat->column_indices->data().get(), 768 loTriFactorT->csrMat->values->data().get(), 769 loTriFactorT->csrMat->column_indices->data().get(), 770 loTriFactorT->csrMat->row_offsets->data().get(), 771 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 772 773 /* perform the solve analysis on the transposed matrix */ 774 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 775 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 776 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 777 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 778 loTriFactorT->solveInfo);CHKERRCUDA(stat); 779 780 /* assign the pointer. Is this really necessary? */ 781 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 782 783 /*********************************************/ 784 /* Now the Transpose of the Upper Tri Factor */ 785 /*********************************************/ 786 787 /* allocate space for the transpose of the upper triangular factor */ 788 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 789 790 /* set the matrix descriptors of the upper triangular factor */ 791 matrixType = cusparseGetMatType(upTriFactor->descr); 792 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 793 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 794 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 795 diagType = cusparseGetMatDiagType(upTriFactor->descr); 796 797 /* Create the matrix description */ 798 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 799 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 800 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 801 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 802 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 803 804 /* Create the solve analysis information */ 805 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 806 807 /* set the operation */ 808 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 809 810 /* allocate GPU space for the CSC of the upper triangular factor*/ 811 upTriFactorT->csrMat = new CsrMatrix; 812 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 813 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 814 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 815 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 816 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 817 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 818 819 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 820 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 821 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 822 upTriFactor->csrMat->values->data().get(), 823 upTriFactor->csrMat->row_offsets->data().get(), 824 upTriFactor->csrMat->column_indices->data().get(), 825 upTriFactorT->csrMat->values->data().get(), 826 upTriFactorT->csrMat->column_indices->data().get(), 827 upTriFactorT->csrMat->row_offsets->data().get(), 828 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 829 830 /* perform the solve analysis on the transposed matrix */ 831 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 832 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 833 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 834 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 835 upTriFactorT->solveInfo);CHKERRCUDA(stat); 836 837 /* assign the pointer. Is this really necessary? */ 838 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 839 PetscFunctionReturn(0); 840 } 841 842 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 843 { 844 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 845 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 846 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 847 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 848 cusparseStatus_t stat; 849 cusparseIndexBase_t indexBase; 850 cudaError_t err; 851 PetscErrorCode ierr; 852 853 PetscFunctionBegin; 854 855 /* allocate space for the triangular factor information */ 856 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 857 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 858 indexBase = cusparseGetMatIndexBase(matstruct->descr); 859 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 860 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 861 862 /* set alpha and beta */ 863 err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 864 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 865 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 866 err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 867 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 868 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 869 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 870 871 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 872 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 873 CsrMatrix *matrixT= new CsrMatrix; 874 matrixT->num_rows = A->cmap->n; 875 matrixT->num_cols = A->rmap->n; 876 matrixT->num_entries = a->nz; 877 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 878 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 879 matrixT->values = new THRUSTARRAY(a->nz); 880 881 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 882 indexBase = cusparseGetMatIndexBase(matstruct->descr); 883 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 884 matrix->num_cols, matrix->num_entries, 885 matrix->values->data().get(), 886 matrix->row_offsets->data().get(), 887 matrix->column_indices->data().get(), 888 matrixT->values->data().get(), 889 matrixT->column_indices->data().get(), 890 matrixT->row_offsets->data().get(), 891 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 892 893 /* assign the pointer */ 894 matstructT->mat = matrixT; 895 ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 896 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 897 #if CUDA_VERSION>=5000 898 /* First convert HYB to CSR */ 899 CsrMatrix *temp= new CsrMatrix; 900 temp->num_rows = A->rmap->n; 901 temp->num_cols = A->cmap->n; 902 temp->num_entries = a->nz; 903 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 904 temp->column_indices = new THRUSTINTARRAY32(a->nz); 905 temp->values = new THRUSTARRAY(a->nz); 906 907 908 stat = cusparse_hyb2csr(cusparsestruct->handle, 909 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 910 temp->values->data().get(), 911 temp->row_offsets->data().get(), 912 temp->column_indices->data().get());CHKERRCUDA(stat); 913 914 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 915 CsrMatrix *tempT= new CsrMatrix; 916 tempT->num_rows = A->rmap->n; 917 tempT->num_cols = A->cmap->n; 918 tempT->num_entries = a->nz; 919 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 920 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 921 tempT->values = new THRUSTARRAY(a->nz); 922 923 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 924 temp->num_cols, temp->num_entries, 925 temp->values->data().get(), 926 temp->row_offsets->data().get(), 927 temp->column_indices->data().get(), 928 tempT->values->data().get(), 929 tempT->column_indices->data().get(), 930 tempT->row_offsets->data().get(), 931 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 932 933 /* Last, convert CSC to HYB */ 934 cusparseHybMat_t hybMat; 935 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 936 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 937 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 938 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 939 matstructT->descr, tempT->values->data().get(), 940 tempT->row_offsets->data().get(), 941 tempT->column_indices->data().get(), 942 hybMat, 0, partition);CHKERRCUDA(stat); 943 944 /* assign the pointer */ 945 matstructT->mat = hybMat; 946 ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr); 947 948 /* delete temporaries */ 949 if (tempT) { 950 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 951 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 952 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 953 delete (CsrMatrix*) tempT; 954 } 955 if (temp) { 956 if (temp->values) delete (THRUSTARRAY*) temp->values; 957 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 958 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 959 delete (CsrMatrix*) temp; 960 } 961 #else 962 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 963 #endif 964 } 965 /* assign the compressed row indices */ 966 matstructT->cprowIndices = new THRUSTINTARRAY; 967 matstructT->cprowIndices->resize(A->cmap->n); 968 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 969 /* assign the pointer */ 970 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 971 PetscFunctionReturn(0); 972 } 973 974 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 975 { 976 PetscInt n = xx->map->n; 977 const PetscScalar *barray; 978 PetscScalar *xarray; 979 thrust::device_ptr<const PetscScalar> bGPU; 980 thrust::device_ptr<PetscScalar> xGPU; 981 cusparseStatus_t stat; 982 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 983 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 984 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 985 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 986 PetscErrorCode ierr; 987 988 PetscFunctionBegin; 989 /* Analyze the matrix and create the transpose ... on the fly */ 990 if (!loTriFactorT && !upTriFactorT) { 991 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 992 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 993 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 994 } 995 996 /* Get the GPU pointers */ 997 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 998 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 999 xGPU = thrust::device_pointer_cast(xarray); 1000 bGPU = thrust::device_pointer_cast(barray); 1001 1002 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1003 /* First, reorder with the row permutation */ 1004 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1005 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1006 xGPU); 1007 1008 /* First, solve U */ 1009 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1010 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1011 upTriFactorT->csrMat->values->data().get(), 1012 upTriFactorT->csrMat->row_offsets->data().get(), 1013 upTriFactorT->csrMat->column_indices->data().get(), 1014 upTriFactorT->solveInfo, 1015 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1016 1017 /* Then, solve L */ 1018 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1019 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1020 loTriFactorT->csrMat->values->data().get(), 1021 loTriFactorT->csrMat->row_offsets->data().get(), 1022 loTriFactorT->csrMat->column_indices->data().get(), 1023 loTriFactorT->solveInfo, 1024 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1025 1026 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1027 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1028 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1029 tempGPU->begin()); 1030 1031 /* Copy the temporary to the full solution. */ 1032 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1033 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034 1035 /* restore */ 1036 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1037 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1038 ierr = WaitForGPU();CHKERRCUDA(ierr); 1039 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1040 PetscFunctionReturn(0); 1041 } 1042 1043 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1044 { 1045 const PetscScalar *barray; 1046 PetscScalar *xarray; 1047 cusparseStatus_t stat; 1048 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1049 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1050 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1051 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1052 PetscErrorCode ierr; 1053 1054 PetscFunctionBegin; 1055 /* Analyze the matrix and create the transpose ... on the fly */ 1056 if (!loTriFactorT && !upTriFactorT) { 1057 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1058 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1059 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1060 } 1061 1062 /* Get the GPU pointers */ 1063 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1064 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1065 1066 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1067 /* First, solve U */ 1068 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1069 upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1070 upTriFactorT->csrMat->values->data().get(), 1071 upTriFactorT->csrMat->row_offsets->data().get(), 1072 upTriFactorT->csrMat->column_indices->data().get(), 1073 upTriFactorT->solveInfo, 1074 barray, tempGPU->data().get());CHKERRCUDA(stat); 1075 1076 /* Then, solve L */ 1077 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1078 loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1079 loTriFactorT->csrMat->values->data().get(), 1080 loTriFactorT->csrMat->row_offsets->data().get(), 1081 loTriFactorT->csrMat->column_indices->data().get(), 1082 loTriFactorT->solveInfo, 1083 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1084 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1085 1086 /* restore */ 1087 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1088 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1089 ierr = WaitForGPU();CHKERRCUDA(ierr); 1090 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1095 { 1096 const PetscScalar *barray; 1097 PetscScalar *xarray; 1098 thrust::device_ptr<const PetscScalar> bGPU; 1099 thrust::device_ptr<PetscScalar> xGPU; 1100 cusparseStatus_t stat; 1101 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1102 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1103 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1104 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1105 PetscErrorCode ierr; 1106 1107 PetscFunctionBegin; 1108 1109 /* Get the GPU pointers */ 1110 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1111 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1112 xGPU = thrust::device_pointer_cast(xarray); 1113 bGPU = thrust::device_pointer_cast(barray); 1114 1115 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1116 /* First, reorder with the row permutation */ 1117 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1118 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1119 xGPU); 1120 1121 /* Next, solve L */ 1122 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1123 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1124 loTriFactor->csrMat->values->data().get(), 1125 loTriFactor->csrMat->row_offsets->data().get(), 1126 loTriFactor->csrMat->column_indices->data().get(), 1127 loTriFactor->solveInfo, 1128 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1129 1130 /* Then, solve U */ 1131 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1132 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1133 upTriFactor->csrMat->values->data().get(), 1134 upTriFactor->csrMat->row_offsets->data().get(), 1135 upTriFactor->csrMat->column_indices->data().get(), 1136 upTriFactor->solveInfo, 1137 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1138 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1139 1140 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1141 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1142 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1143 tempGPU->begin()); 1144 1145 /* Copy the temporary to the full solution. */ 1146 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1147 1148 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1149 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1150 ierr = WaitForGPU();CHKERRCUDA(ierr); 1151 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1152 PetscFunctionReturn(0); 1153 } 1154 1155 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1156 { 1157 const PetscScalar *barray; 1158 PetscScalar *xarray; 1159 cusparseStatus_t stat; 1160 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1161 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1162 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1163 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1164 PetscErrorCode ierr; 1165 1166 PetscFunctionBegin; 1167 /* Get the GPU pointers */ 1168 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1169 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1170 1171 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1172 /* First, solve L */ 1173 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1174 loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1175 loTriFactor->csrMat->values->data().get(), 1176 loTriFactor->csrMat->row_offsets->data().get(), 1177 loTriFactor->csrMat->column_indices->data().get(), 1178 loTriFactor->solveInfo, 1179 barray, tempGPU->data().get());CHKERRCUDA(stat); 1180 1181 /* Next, solve U */ 1182 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1183 upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1184 upTriFactor->csrMat->values->data().get(), 1185 upTriFactor->csrMat->row_offsets->data().get(), 1186 upTriFactor->csrMat->column_indices->data().get(), 1187 upTriFactor->solveInfo, 1188 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1189 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1190 1191 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1192 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1193 ierr = WaitForGPU();CHKERRCUDA(ierr); 1194 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1195 PetscFunctionReturn(0); 1196 } 1197 1198 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1199 { 1200 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1201 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203 PetscInt m = A->rmap->n,*ii,*ridx; 1204 PetscErrorCode ierr; 1205 cusparseStatus_t stat; 1206 cudaError_t err; 1207 1208 PetscFunctionBegin; 1209 if (A->pinnedtocpu) PetscFunctionReturn(0); 1210 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1211 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1212 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1213 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1214 /* copy values only */ 1215 matrix->values->assign(a->a, a->a+a->nz); 1216 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1217 } else { 1218 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1219 try { 1220 cusparsestruct->nonzerorow=0; 1221 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1222 1223 if (a->compressedrow.use) { 1224 m = a->compressedrow.nrows; 1225 ii = a->compressedrow.i; 1226 ridx = a->compressedrow.rindex; 1227 } else { 1228 /* Forcing compressed row on the GPU */ 1229 int k=0; 1230 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1231 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1232 ii[0]=0; 1233 for (int j = 0; j<m; j++) { 1234 if ((a->i[j+1]-a->i[j])>0) { 1235 ii[k] = a->i[j]; 1236 ridx[k]= j; 1237 k++; 1238 } 1239 } 1240 ii[cusparsestruct->nonzerorow] = a->nz; 1241 m = cusparsestruct->nonzerorow; 1242 } 1243 1244 /* allocate space for the triangular factor information */ 1245 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1246 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1247 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1248 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1249 1250 err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err); 1251 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1252 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1253 err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1254 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1255 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1256 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1257 1258 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1259 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1260 /* set the matrix */ 1261 CsrMatrix *matrix= new CsrMatrix; 1262 matrix->num_rows = m; 1263 matrix->num_cols = A->cmap->n; 1264 matrix->num_entries = a->nz; 1265 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1266 matrix->row_offsets->assign(ii, ii + m+1); 1267 1268 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1269 matrix->column_indices->assign(a->j, a->j+a->nz); 1270 1271 matrix->values = new THRUSTARRAY(a->nz); 1272 matrix->values->assign(a->a, a->a+a->nz); 1273 1274 /* assign the pointer */ 1275 matstruct->mat = matrix; 1276 1277 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1278 #if CUDA_VERSION>=4020 1279 CsrMatrix *matrix= new CsrMatrix; 1280 matrix->num_rows = m; 1281 matrix->num_cols = A->cmap->n; 1282 matrix->num_entries = a->nz; 1283 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1284 matrix->row_offsets->assign(ii, ii + m+1); 1285 1286 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1287 matrix->column_indices->assign(a->j, a->j+a->nz); 1288 1289 matrix->values = new THRUSTARRAY(a->nz); 1290 matrix->values->assign(a->a, a->a+a->nz); 1291 1292 cusparseHybMat_t hybMat; 1293 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1294 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1295 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1296 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1297 matstruct->descr, matrix->values->data().get(), 1298 matrix->row_offsets->data().get(), 1299 matrix->column_indices->data().get(), 1300 hybMat, 0, partition);CHKERRCUDA(stat); 1301 /* assign the pointer */ 1302 matstruct->mat = hybMat; 1303 1304 if (matrix) { 1305 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1306 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1307 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1308 delete (CsrMatrix*)matrix; 1309 } 1310 #endif 1311 } 1312 1313 /* assign the compressed row indices */ 1314 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1315 matstruct->cprowIndices->assign(ridx,ridx+m); 1316 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1317 1318 /* assign the pointer */ 1319 cusparsestruct->mat = matstruct; 1320 1321 if (!a->compressedrow.use) { 1322 ierr = PetscFree(ii);CHKERRQ(ierr); 1323 ierr = PetscFree(ridx);CHKERRQ(ierr); 1324 } 1325 cusparsestruct->workVector = new THRUSTARRAY(m); 1326 } catch(char *ex) { 1327 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1328 } 1329 cusparsestruct->nonzerostate = A->nonzerostate; 1330 } 1331 ierr = WaitForGPU();CHKERRCUDA(ierr); 1332 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1333 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1334 } 1335 PetscFunctionReturn(0); 1336 } 1337 1338 struct VecCUDAPlusEquals 1339 { 1340 template <typename Tuple> 1341 __host__ __device__ 1342 void operator()(Tuple t) 1343 { 1344 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1345 } 1346 }; 1347 1348 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1349 { 1350 PetscErrorCode ierr; 1351 1352 PetscFunctionBegin; 1353 ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1354 PetscFunctionReturn(0); 1355 } 1356 1357 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1358 { 1359 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1360 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1361 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1362 const PetscScalar *xarray; 1363 PetscScalar *yarray; 1364 PetscErrorCode ierr; 1365 cusparseStatus_t stat; 1366 1367 PetscFunctionBegin; 1368 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1369 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1370 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1371 if (!matstructT) { 1372 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1373 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1374 } 1375 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1376 ierr = VecSet(yy,0);CHKERRQ(ierr); 1377 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1378 1379 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1380 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1381 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1382 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1383 mat->num_rows, mat->num_cols, 1384 mat->num_entries, matstructT->alpha, matstructT->descr, 1385 mat->values->data().get(), mat->row_offsets->data().get(), 1386 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1387 yarray);CHKERRCUDA(stat); 1388 } else { 1389 #if CUDA_VERSION>=4020 1390 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1391 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1392 matstructT->alpha, matstructT->descr, hybMat, 1393 xarray, matstructT->beta_zero, 1394 yarray);CHKERRCUDA(stat); 1395 #endif 1396 } 1397 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1398 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1399 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1400 if (!cusparsestruct->stream) { 1401 ierr = WaitForGPU();CHKERRCUDA(ierr); 1402 } 1403 ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1404 PetscFunctionReturn(0); 1405 } 1406 1407 1408 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1409 { 1410 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1411 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1412 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1413 const PetscScalar *xarray; 1414 PetscScalar *zarray,*dptr,*beta; 1415 PetscErrorCode ierr; 1416 cusparseStatus_t stat; 1417 1418 PetscFunctionBegin; 1419 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1420 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1421 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1422 try { 1423 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1424 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1425 dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get(); 1426 beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero; 1427 1428 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1429 /* csr_spmv is multiply add */ 1430 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1431 /* here we need to be careful to set the number of rows in the multiply to the 1432 number of compressed rows in the matrix ... which is equivalent to the 1433 size of the workVector */ 1434 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1435 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1436 mat->num_rows, mat->num_cols, 1437 mat->num_entries, matstruct->alpha, matstruct->descr, 1438 mat->values->data().get(), mat->row_offsets->data().get(), 1439 mat->column_indices->data().get(), xarray, beta, 1440 dptr);CHKERRCUDA(stat); 1441 } else { 1442 #if CUDA_VERSION>=4020 1443 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1444 if (cusparsestruct->workVector->size()) { 1445 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1446 matstruct->alpha, matstruct->descr, hybMat, 1447 xarray, beta, 1448 dptr);CHKERRCUDA(stat); 1449 } 1450 #endif 1451 } 1452 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1453 1454 if (yy) { 1455 if (dptr != zarray) { 1456 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1457 } else if (zz != yy) { 1458 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); 1459 } 1460 } else if (dptr != zarray) { 1461 ierr = VecSet(zz,0);CHKERRQ(ierr); 1462 } 1463 /* scatter the data from the temporary into the full vector with a += operation */ 1464 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1465 if (dptr != zarray) { 1466 thrust::device_ptr<PetscScalar> zptr; 1467 1468 zptr = thrust::device_pointer_cast(zarray); 1469 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1470 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1471 VecCUDAPlusEquals()); 1472 } 1473 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1474 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1475 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1476 } catch(char *ex) { 1477 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1478 } 1479 if (!yy) { /* MatMult */ 1480 if (!cusparsestruct->stream) { 1481 ierr = WaitForGPU();CHKERRCUDA(ierr); 1482 } 1483 } 1484 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1485 PetscFunctionReturn(0); 1486 } 1487 1488 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1489 { 1490 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1491 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1492 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1493 thrust::device_ptr<PetscScalar> zptr; 1494 const PetscScalar *xarray; 1495 PetscScalar *zarray; 1496 PetscErrorCode ierr; 1497 cusparseStatus_t stat; 1498 1499 PetscFunctionBegin; 1500 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1501 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1502 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1503 if (!matstructT) { 1504 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1505 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1506 } 1507 1508 try { 1509 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1510 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1511 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1512 zptr = thrust::device_pointer_cast(zarray); 1513 1514 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1515 /* multiply add with matrix transpose */ 1516 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1517 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1518 /* here we need to be careful to set the number of rows in the multiply to the 1519 number of compressed rows in the matrix ... which is equivalent to the 1520 size of the workVector */ 1521 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1522 mat->num_rows, mat->num_cols, 1523 mat->num_entries, matstructT->alpha, matstructT->descr, 1524 mat->values->data().get(), mat->row_offsets->data().get(), 1525 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1526 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1527 } else { 1528 #if CUDA_VERSION>=4020 1529 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1530 if (cusparsestruct->workVector->size()) { 1531 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1532 matstructT->alpha, matstructT->descr, hybMat, 1533 xarray, matstructT->beta_zero, 1534 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1535 } 1536 #endif 1537 } 1538 1539 /* scatter the data from the temporary into the full vector with a += operation */ 1540 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1541 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1542 VecCUDAPlusEquals()); 1543 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1544 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1545 ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr); 1546 1547 } catch(char *ex) { 1548 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1549 } 1550 ierr = WaitForGPU();CHKERRCUDA(ierr); 1551 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1552 PetscFunctionReturn(0); 1553 } 1554 1555 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1556 { 1557 PetscErrorCode ierr; 1558 1559 PetscFunctionBegin; 1560 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1561 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1562 if (A->factortype == MAT_FACTOR_NONE) { 1563 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1564 } 1565 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1566 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1567 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1568 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1569 PetscFunctionReturn(0); 1570 } 1571 1572 /* --------------------------------------------------------------------------------*/ 1573 /*@ 1574 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1575 (the default parallel PETSc format). This matrix will ultimately pushed down 1576 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1577 assembly performance the user should preallocate the matrix storage by setting 1578 the parameter nz (or the array nnz). By setting these parameters accurately, 1579 performance during matrix assembly can be increased by more than a factor of 50. 1580 1581 Collective 1582 1583 Input Parameters: 1584 + comm - MPI communicator, set to PETSC_COMM_SELF 1585 . m - number of rows 1586 . n - number of columns 1587 . nz - number of nonzeros per row (same for all rows) 1588 - nnz - array containing the number of nonzeros in the various rows 1589 (possibly different for each row) or NULL 1590 1591 Output Parameter: 1592 . A - the matrix 1593 1594 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1595 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1596 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1597 1598 Notes: 1599 If nnz is given then nz is ignored 1600 1601 The AIJ format (also called the Yale sparse matrix format or 1602 compressed row storage), is fully compatible with standard Fortran 77 1603 storage. That is, the stored row and column indices can begin at 1604 either one (as in Fortran) or zero. See the users' manual for details. 1605 1606 Specify the preallocated storage with either nz or nnz (not both). 1607 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1608 allocation. For large problems you MUST preallocate memory or you 1609 will get TERRIBLE performance, see the users' manual chapter on matrices. 1610 1611 By default, this format uses inodes (identical nodes) when possible, to 1612 improve numerical efficiency of matrix-vector products and solves. We 1613 search for consecutive rows with the same nonzero structure, thereby 1614 reusing matrix information to achieve increased efficiency. 1615 1616 Level: intermediate 1617 1618 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1619 @*/ 1620 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1621 { 1622 PetscErrorCode ierr; 1623 1624 PetscFunctionBegin; 1625 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1626 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1627 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1628 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1629 PetscFunctionReturn(0); 1630 } 1631 1632 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1633 { 1634 PetscErrorCode ierr; 1635 1636 PetscFunctionBegin; 1637 if (A->factortype==MAT_FACTOR_NONE) { 1638 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1639 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1640 } 1641 } else { 1642 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1643 } 1644 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1645 PetscFunctionReturn(0); 1646 } 1647 1648 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1649 { 1650 PetscErrorCode ierr; 1651 Mat C; 1652 cusparseStatus_t stat; 1653 cusparseHandle_t handle=0; 1654 1655 PetscFunctionBegin; 1656 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1657 C = *B; 1658 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1659 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1660 1661 /* inject CUSPARSE-specific stuff */ 1662 if (C->factortype==MAT_FACTOR_NONE) { 1663 /* you cannot check the inode.use flag here since the matrix was just created. 1664 now build a GPU matrix data structure */ 1665 C->spptr = new Mat_SeqAIJCUSPARSE; 1666 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1667 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1668 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1669 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1670 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1671 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1672 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1673 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1674 } else { 1675 /* NEXT, set the pointers to the triangular factors */ 1676 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1677 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1678 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1679 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1680 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1681 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1682 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1683 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1684 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1685 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1686 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1687 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1688 } 1689 1690 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1691 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1692 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1693 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1694 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1695 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1696 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1697 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1698 1699 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1700 1701 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1702 1703 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1704 PetscFunctionReturn(0); 1705 } 1706 1707 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B) 1708 { 1709 PetscErrorCode ierr; 1710 cusparseStatus_t stat; 1711 cusparseHandle_t handle=0; 1712 1713 PetscFunctionBegin; 1714 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1715 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1716 1717 if (B->factortype==MAT_FACTOR_NONE) { 1718 /* you cannot check the inode.use flag here since the matrix was just created. 1719 now build a GPU matrix data structure */ 1720 B->spptr = new Mat_SeqAIJCUSPARSE; 1721 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1722 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1723 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1725 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1726 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1727 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1728 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1729 } else { 1730 /* NEXT, set the pointers to the triangular factors */ 1731 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1732 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1733 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1734 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1736 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1737 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1739 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1740 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1741 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1742 } 1743 1744 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1745 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1746 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1747 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1748 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1749 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1750 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1751 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1752 1753 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1754 1755 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1756 1757 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1758 PetscFunctionReturn(0); 1759 } 1760 1761 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1762 { 1763 PetscErrorCode ierr; 1764 1765 PetscFunctionBegin; 1766 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1767 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1768 PetscFunctionReturn(0); 1769 } 1770 1771 /*MC 1772 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1773 1774 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1775 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1776 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1777 1778 Options Database Keys: 1779 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1780 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1781 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1782 1783 Level: beginner 1784 1785 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1786 M*/ 1787 1788 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1789 1790 1791 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1792 { 1793 PetscErrorCode ierr; 1794 1795 PetscFunctionBegin; 1796 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1797 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1798 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1799 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1800 PetscFunctionReturn(0); 1801 } 1802 1803 1804 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1805 { 1806 cusparseStatus_t stat; 1807 cusparseHandle_t handle; 1808 1809 PetscFunctionBegin; 1810 if (*cusparsestruct) { 1811 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1812 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1813 delete (*cusparsestruct)->workVector; 1814 if (handle = (*cusparsestruct)->handle) { 1815 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1816 } 1817 delete *cusparsestruct; 1818 *cusparsestruct = 0; 1819 } 1820 PetscFunctionReturn(0); 1821 } 1822 1823 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1824 { 1825 PetscFunctionBegin; 1826 if (*mat) { 1827 delete (*mat)->values; 1828 delete (*mat)->column_indices; 1829 delete (*mat)->row_offsets; 1830 delete *mat; 1831 *mat = 0; 1832 } 1833 PetscFunctionReturn(0); 1834 } 1835 1836 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1837 { 1838 cusparseStatus_t stat; 1839 PetscErrorCode ierr; 1840 1841 PetscFunctionBegin; 1842 if (*trifactor) { 1843 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1844 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1845 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1846 delete *trifactor; 1847 *trifactor = 0; 1848 } 1849 PetscFunctionReturn(0); 1850 } 1851 1852 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1853 { 1854 CsrMatrix *mat; 1855 cusparseStatus_t stat; 1856 cudaError_t err; 1857 1858 PetscFunctionBegin; 1859 if (*matstruct) { 1860 if ((*matstruct)->mat) { 1861 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1862 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1863 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1864 } else { 1865 mat = (CsrMatrix*)(*matstruct)->mat; 1866 CsrMatrix_Destroy(&mat); 1867 } 1868 } 1869 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1870 delete (*matstruct)->cprowIndices; 1871 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1872 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 1873 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 1874 delete *matstruct; 1875 *matstruct = 0; 1876 } 1877 PetscFunctionReturn(0); 1878 } 1879 1880 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1881 { 1882 cusparseHandle_t handle; 1883 cusparseStatus_t stat; 1884 1885 PetscFunctionBegin; 1886 if (*trifactors) { 1887 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1888 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1889 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1890 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1891 delete (*trifactors)->rpermIndices; 1892 delete (*trifactors)->cpermIndices; 1893 delete (*trifactors)->workVector; 1894 if (handle = (*trifactors)->handle) { 1895 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1896 } 1897 delete *trifactors; 1898 *trifactors = 0; 1899 } 1900 PetscFunctionReturn(0); 1901 } 1902 1903