/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16 #define PETSC_HAVE_THRUST_ASYNC 1
17 // thrust::for_each(thrust::cuda::par.on()) requires C++14
18 #endif
19 #include <thrust/iterator/constant_iterator.h>
20 #include <thrust/remove.h>
21 #include <thrust/sort.h>
22 #include <thrust/unique.h>
23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24 #include <cuda/std/functional>
25 #endif
26
27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29 /*
30 The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32 */
33 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36 #endif
37
38 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
41 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
43 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
44 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
46 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48 #endif
49 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
51 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
52 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
53 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
54 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
55 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
59
60 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
63 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
64
65 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
67
68 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
71
MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)72 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
73 {
74 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
75
76 PetscFunctionBegin;
77 switch (op) {
78 case MAT_CUSPARSE_MULT:
79 cusparsestruct->format = format;
80 break;
81 case MAT_CUSPARSE_ALL:
82 cusparsestruct->format = format;
83 break;
84 default:
85 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
86 }
87 PetscFunctionReturn(PETSC_SUCCESS);
88 }
89
90 /*@
91 MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
92 operation. Only the `MatMult()` operation can use different GPU storage formats
93
94 Not Collective
95
96 Input Parameters:
97 + A - Matrix of type `MATSEQAIJCUSPARSE`
98 . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
99 `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
100 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
101
102 Level: intermediate
103
104 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105 @*/
MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)106 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
107 {
108 PetscFunctionBegin;
109 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
110 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
111 PetscFunctionReturn(PETSC_SUCCESS);
112 }
113
MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)114 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115 {
116 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
117
118 PetscFunctionBegin;
119 cusparsestruct->use_cpu_solve = use_cpu;
120 PetscFunctionReturn(PETSC_SUCCESS);
121 }
122
123 /*@
124 MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
125
126 Input Parameters:
127 + A - Matrix of type `MATSEQAIJCUSPARSE`
128 - use_cpu - set flag for using the built-in CPU `MatSolve()`
129
130 Level: intermediate
131
132 Note:
133 The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135 This method to specify if the solve is done on the CPU or GPU (GPU is the default).
136
137 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138 @*/
MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)139 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
140 {
141 PetscFunctionBegin;
142 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
143 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
144 PetscFunctionReturn(PETSC_SUCCESS);
145 }
146
MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)147 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148 {
149 PetscFunctionBegin;
150 switch (op) {
151 case MAT_FORM_EXPLICIT_TRANSPOSE:
152 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
153 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154 A->form_explicit_transpose = flg;
155 break;
156 default:
157 PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158 break;
159 }
160 PetscFunctionReturn(PETSC_SUCCESS);
161 }
162
MatSetFromOptions_SeqAIJCUSPARSE(Mat A,PetscOptionItems PetscOptionsObject)163 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164 {
165 MatCUSPARSEStorageFormat format;
166 PetscBool flg;
167 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
168
169 PetscFunctionBegin;
170 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171 if (A->factortype == MAT_FACTOR_NONE) {
172 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
174
175 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184 #else
185 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186 #endif
187 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
189
190 PetscCall(
191 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193 #endif
194 }
195 PetscOptionsHeadEnd();
196 PetscFunctionReturn(PETSC_SUCCESS);
197 }
198
199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)200 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201 {
202 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
203 PetscInt m = A->rmap->n;
204 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205 const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
206 const MatScalar *Aa = a->a;
207 PetscInt *Mi, *Mj, Mnz;
208 PetscScalar *Ma;
209
210 PetscFunctionBegin;
211 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
212 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
213 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
214 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
215 Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
216 PetscCall(PetscMalloc1(m + 1, &Mi));
217 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
218 PetscCall(PetscMalloc1(Mnz, &Ma));
219 Mi[0] = 0;
220 for (PetscInt i = 0; i < m; i++) {
221 PetscInt llen = Ai[i + 1] - Ai[i];
222 PetscInt ulen = adiag[i] - adiag[i + 1];
223 PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
224 Mj[Mi[i] + llen] = i; // diagonal entry
225 PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
226 Mi[i + 1] = Mi[i] + llen + ulen;
227 }
228 // Copy M (L,U) from host to device
229 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
230 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
231 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
232 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
233 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
234
235 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
236 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
237 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
238 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
239 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
240 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
241 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
242 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
243
244 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
245 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
246 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
247
248 fillMode = CUSPARSE_FILL_MODE_UPPER;
249 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
250 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
251 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
252 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
253
254 // Allocate work vectors in SpSv
255 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
256 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
257
258 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
259 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
260
261 // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
262 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
263 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
264 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
265 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
266 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
267 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
268
269 // Record for reuse
270 fs->csrRowPtr_h = Mi;
271 fs->csrVal_h = Ma;
272 PetscCall(PetscFree(Mj));
273 }
274 // Copy the value
275 Mi = fs->csrRowPtr_h;
276 Ma = fs->csrVal_h;
277 Mnz = Mi[m];
278 for (PetscInt i = 0; i < m; i++) {
279 PetscInt llen = Ai[i + 1] - Ai[i];
280 PetscInt ulen = adiag[i] - adiag[i + 1];
281 PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
282 Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]]; // recover the diagonal entry
283 PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
284 }
285 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
286
287 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
288 if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
289 // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
290 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
292 } else
293 #endif
294 {
295 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
296 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
297
298 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
299 fs->updatedSpSVAnalysis = PETSC_TRUE;
300 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
301 }
302 }
303 PetscFunctionReturn(PETSC_SUCCESS);
304 }
305 #else
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)306 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
307 {
308 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
309 PetscInt n = A->rmap->n;
310 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
311 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
312 const PetscInt *ai = a->i, *aj = a->j, *vi;
313 const MatScalar *aa = a->a, *v;
314 PetscInt *AiLo, *AjLo;
315 PetscInt i, nz, nzLower, offset, rowOffset;
316
317 PetscFunctionBegin;
318 if (!n) PetscFunctionReturn(PETSC_SUCCESS);
319 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
320 try {
321 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
322 nzLower = n + ai[n] - ai[1];
323 if (!loTriFactor) {
324 PetscScalar *AALo;
325
326 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
327
328 /* Allocate Space for the lower triangular matrix */
329 PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
330 PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
331
332 /* Fill the lower triangular matrix */
333 AiLo[0] = (PetscInt)0;
334 AiLo[n] = nzLower;
335 AjLo[0] = (PetscInt)0;
336 AALo[0] = (MatScalar)1.0;
337 v = aa;
338 vi = aj;
339 offset = 1;
340 rowOffset = 1;
341 for (i = 1; i < n; i++) {
342 nz = ai[i + 1] - ai[i];
343 /* additional 1 for the term on the diagonal */
344 AiLo[i] = rowOffset;
345 rowOffset += nz + 1;
346
347 PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
348 PetscCall(PetscArraycpy(&AALo[offset], v, nz));
349
350 offset += nz;
351 AjLo[offset] = (PetscInt)i;
352 AALo[offset] = (MatScalar)1.0;
353 offset += 1;
354
355 v += nz;
356 vi += nz;
357 }
358
359 /* allocate space for the triangular factor information */
360 PetscCall(PetscNew(&loTriFactor));
361 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
362 /* Create the matrix description */
363 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
364 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
365 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
366 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
367 #else
368 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
369 #endif
370 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
371 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
372
373 /* set the operation */
374 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
375
376 /* set the matrix */
377 loTriFactor->csrMat = new CsrMatrix;
378 loTriFactor->csrMat->num_rows = n;
379 loTriFactor->csrMat->num_cols = n;
380 loTriFactor->csrMat->num_entries = nzLower;
381
382 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
383 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
384
385 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
386 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
387
388 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
389 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
390
391 /* Create the solve analysis information */
392 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
393 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
394 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
395 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
396 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
397 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
398 #endif
399
400 /* perform the solve analysis */
401 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
402 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
403 PetscCallCUDA(WaitForCUDA());
404 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
405
406 /* assign the pointer */
407 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
408 loTriFactor->AA_h = AALo;
409 PetscCallCUDA(cudaFreeHost(AiLo));
410 PetscCallCUDA(cudaFreeHost(AjLo));
411 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
412 } else { /* update values only */
413 if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
414 /* Fill the lower triangular matrix */
415 loTriFactor->AA_h[0] = 1.0;
416 v = aa;
417 vi = aj;
418 offset = 1;
419 for (i = 1; i < n; i++) {
420 nz = ai[i + 1] - ai[i];
421 PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
422 offset += nz;
423 loTriFactor->AA_h[offset] = 1.0;
424 offset += 1;
425 v += nz;
426 }
427 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
428 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
429 }
430 } catch (char *ex) {
431 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
432 }
433 }
434 PetscFunctionReturn(PETSC_SUCCESS);
435 }
436
MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)437 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
438 {
439 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
440 PetscInt n = A->rmap->n;
441 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
442 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
443 const PetscInt *aj = a->j, *adiag, *vi;
444 const MatScalar *aa = a->a, *v;
445 PetscInt *AiUp, *AjUp;
446 PetscInt i, nz, nzUpper, offset;
447
448 PetscFunctionBegin;
449 if (!n) PetscFunctionReturn(PETSC_SUCCESS);
450 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
451 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
452 try {
453 /* next, figure out the number of nonzeros in the upper triangular matrix. */
454 nzUpper = adiag[0] - adiag[n];
455 if (!upTriFactor) {
456 PetscScalar *AAUp;
457
458 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
459
460 /* Allocate Space for the upper triangular matrix */
461 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
462 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
463
464 /* Fill the upper triangular matrix */
465 AiUp[0] = (PetscInt)0;
466 AiUp[n] = nzUpper;
467 offset = nzUpper;
468 for (i = n - 1; i >= 0; i--) {
469 v = aa + adiag[i + 1] + 1;
470 vi = aj + adiag[i + 1] + 1;
471
472 /* number of elements NOT on the diagonal */
473 nz = adiag[i] - adiag[i + 1] - 1;
474
475 /* decrement the offset */
476 offset -= (nz + 1);
477
478 /* first, set the diagonal elements */
479 AjUp[offset] = (PetscInt)i;
480 AAUp[offset] = (MatScalar)1. / v[nz];
481 AiUp[i] = AiUp[i + 1] - (nz + 1);
482
483 PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
484 PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
485 }
486
487 /* allocate space for the triangular factor information */
488 PetscCall(PetscNew(&upTriFactor));
489 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
490
491 /* Create the matrix description */
492 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
493 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
494 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
495 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
496 #else
497 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
498 #endif
499 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
500 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
501
502 /* set the operation */
503 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
504
505 /* set the matrix */
506 upTriFactor->csrMat = new CsrMatrix;
507 upTriFactor->csrMat->num_rows = n;
508 upTriFactor->csrMat->num_cols = n;
509 upTriFactor->csrMat->num_entries = nzUpper;
510
511 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
512 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
513
514 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
515 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
516
517 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
518 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
519
520 /* Create the solve analysis information */
521 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
522 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
523 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
524 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
525 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
526 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
527 #endif
528
529 /* perform the solve analysis */
530 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
531 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
532
533 PetscCallCUDA(WaitForCUDA());
534 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
535
536 /* assign the pointer */
537 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
538 upTriFactor->AA_h = AAUp;
539 PetscCallCUDA(cudaFreeHost(AiUp));
540 PetscCallCUDA(cudaFreeHost(AjUp));
541 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
542 } else {
543 if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
544 /* Fill the upper triangular matrix */
545 offset = nzUpper;
546 for (i = n - 1; i >= 0; i--) {
547 v = aa + adiag[i + 1] + 1;
548
549 /* number of elements NOT on the diagonal */
550 nz = adiag[i] - adiag[i + 1] - 1;
551
552 /* decrement the offset */
553 offset -= (nz + 1);
554
555 /* first, set the diagonal elements */
556 upTriFactor->AA_h[offset] = 1. / v[nz];
557 PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
558 }
559 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
560 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
561 }
562 } catch (char *ex) {
563 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
564 }
565 }
566 PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 #endif
569
MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)570 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
571 {
572 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
573 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
574 IS isrow = a->row, isicol = a->icol;
575 PetscBool row_identity, col_identity;
576 PetscInt n = A->rmap->n;
577
578 PetscFunctionBegin;
579 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
580 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
581 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
582 #else
583 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
584 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
585 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
586 #endif
587
588 cusparseTriFactors->nnz = a->nz;
589
590 A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
591 /* lower triangular indices */
592 PetscCall(ISIdentity(isrow, &row_identity));
593 if (!row_identity && !cusparseTriFactors->rpermIndices) {
594 const PetscInt *r;
595
596 PetscCall(ISGetIndices(isrow, &r));
597 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
598 cusparseTriFactors->rpermIndices->assign(r, r + n);
599 PetscCall(ISRestoreIndices(isrow, &r));
600 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
601 }
602
603 /* upper triangular indices */
604 PetscCall(ISIdentity(isicol, &col_identity));
605 if (!col_identity && !cusparseTriFactors->cpermIndices) {
606 const PetscInt *c;
607
608 PetscCall(ISGetIndices(isicol, &c));
609 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
610 cusparseTriFactors->cpermIndices->assign(c, c + n);
611 PetscCall(ISRestoreIndices(isicol, &c));
612 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
613 }
614 PetscFunctionReturn(PETSC_SUCCESS);
615 }
616
617 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky - Copies the host Cholesky/ICC factor held in A to the
  device as a regular CSR matrix U plus a diagonal array, and prepares the cuSPARSE SpSV solves with U and U^T.

  Input Parameter:
. A - the factored matrix (A->data is read as Mat_SeqAIJ, A->spptr as Mat_SeqAIJCUSPARSETriFactors)

  Notes:
  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_CPU. This routine does not change A->offloadmask itself.

  First call (fs->csrRowPtr still NULL): allocates the host staging arrays, the device CSR arrays, the diagonal,
  the work vectors X and Y, the cuSPARSE matrix/vector/SpSV descriptors and the SpSV buffers.
  The host staging arrays for values and diagonal are kept in fs->csrVal_h and fs->diag_h for reuse.

  Every call: refills the values and the diagonal from a->a, copies them to the device, and then either runs
  cusparseSpSV_analysis() (first time) or cusparseSpSV_updateMatrix() (CUDA >= 12.1.1, after the first analysis).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      // Build the column indices of M: in each row the diagonal comes first, followed by the first ulen-1 column indices of the host row
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; M reuses the host row pointers Ai unchanged
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: the host staging arrays are kept so later calls only refill and re-upload the values
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (done on every call, using the staging arrays recorded during the first-time setup)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries are stored negated relative to the host values
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // Once the analysis has been done, only the values need to be refreshed in the two SpSV descriptors
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
712
713 // Solve Ut D U x = b
MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A,Vec b,Vec x)714 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
715 {
716 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
717 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
718 const PetscScalar *barray;
719 PetscScalar *xarray;
720 thrust::device_ptr<const PetscScalar> bGPU;
721 thrust::device_ptr<PetscScalar> xGPU;
722 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
723 PetscInt m = A->rmap->n;
724
725 PetscFunctionBegin;
726 PetscCall(PetscLogGpuTimeBegin());
727 PetscCall(VecCUDAGetArrayWrite(x, &xarray));
728 PetscCall(VecCUDAGetArrayRead(b, &barray));
729 xGPU = thrust::device_pointer_cast(xarray);
730 bGPU = thrust::device_pointer_cast(barray);
731
732 // Reorder b with the row permutation if needed, and wrap the result in fs->X
733 if (fs->rpermIndices) {
734 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
735 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
736 } else {
737 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
738 }
739
740 // Solve Ut Y = X
741 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
742 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
743
744 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
745 // It is basically a vector element-wise multiplication, but cublas does not have it!
746 #if CCCL_VERSION >= 3001000
747 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
748 #else
749 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
750 #endif
751
752 // Solve U X = Y
753 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
754 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
755 } else {
756 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
757 }
758 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
759
760 // Reorder X with the column permutation if needed, and put the result back to x
761 if (fs->cpermIndices) {
762 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
763 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
764 }
765
766 PetscCall(VecCUDARestoreArrayRead(b, &barray));
767 PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
768 PetscCall(PetscLogGpuTimeEnd());
769 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
770 PetscFunctionReturn(PETSC_SUCCESS);
771 }
772 #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds (or refreshes the values of) the two triangular factor
  structures used by the legacy csrsv solves from the host ICC/Cholesky factor stored in A.
  Compiled only when the CUDA version is older than 11.4.

  Input Parameter:
. A - the factored matrix (A->data is read both as Mat_SeqAIJ and Mat_SeqSBAIJ, A->spptr as Mat_SeqAIJCUSPARSETriFactors)

  Notes:
  Returns immediately when A has no local rows. Work is done only when A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If neither factor structure exists yet, both are created: CSR arrays are uploaded, cuSPARSE matrix
  descriptors are set, and the csrsv analysis is performed. Otherwise only the value arrays are
  recomputed and re-uploaded.

  Both structures share the same row offsets and column indices (AiUp/AjUp). The "up" factor is solved
  with CUSPARSE_OPERATION_NON_TRANSPOSE and a unit diagonal, the "lo" factor with
  CUSPARSE_OPERATION_TRANSPOSE and a non-unit diagonal.

  NOTE(review): the pinned host buffers are released only on the success path; an early return through
  PetscCall()/PetscCallCUDA() appears to skip the cudaFreeHost() calls -- confirm whether that matters.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of the two factors; needed on both paths below */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz], the last stored entry of the row, is used as the diagonal value */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* off-diagonals: the up factor holds the negated host values, the lo factor those values divided by v[nz] */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* the "lo" factor is stored with the same upper-triangular pattern and is applied through a transpose solve (see solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        /* log the upload of index and value arrays for both factors, then release the index staging buffers */
        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factor structures already exist: only recompute and re-upload the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        /* this branch is also reached when only one of the two structures is missing, hence the explicit checks */
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      /* only exceptions thrown as char* are translated into a PETSc error here */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969 #endif
970
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)971 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
972 {
973 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
974 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
975 IS ip = a->row;
976 PetscBool perm_identity;
977 PetscInt n = A->rmap->n;
978
979 PetscFunctionBegin;
980 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
981
982 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
983 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
984 #else
985 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
986 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
987 #endif
988 cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
989
990 A->offloadmask = PETSC_OFFLOAD_BOTH;
991
992 /* lower triangular indices */
993 PetscCall(ISIdentity(ip, &perm_identity));
994 if (!perm_identity) {
995 IS iip;
996 const PetscInt *irip, *rip;
997
998 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
999 PetscCall(ISGetIndices(iip, &irip));
1000 PetscCall(ISGetIndices(ip, &rip));
1001 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1002 cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1003 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1004 cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1005 PetscCall(ISRestoreIndices(iip, &irip));
1006 PetscCall(ISDestroy(&iip));
1007 PetscCall(ISRestoreIndices(ip, &rip));
1008 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1009 }
1010 PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012
MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo * info)1013 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1014 {
1015 PetscFunctionBegin;
1016 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1017 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1018 B->offloadmask = PETSC_OFFLOAD_CPU;
1019
1020 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1021 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1022 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1023 #else
1024 /* determine which version of MatSolve needs to be used. */
1025 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1026 IS ip = b->row;
1027 PetscBool perm_identity;
1028
1029 PetscCall(ISIdentity(ip, &perm_identity));
1030 if (perm_identity) {
1031 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1032 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1033 } else {
1034 B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1035 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1036 }
1037 #endif
1038 B->ops->matsolve = NULL;
1039 B->ops->matsolvetranspose = NULL;
1040
1041 /* get the triangular factors */
1042 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1043 PetscFunctionReturn(PETSC_SUCCESS);
1044 }
1045
1046 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)1047 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048 {
1049 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054 cusparseIndexBase_t indexBase;
1055 cusparseMatrixType_t matrixType;
1056 cusparseFillMode_t fillMode;
1057 cusparseDiagType_t diagType;
1058
1059 PetscFunctionBegin;
1060 /* allocate space for the transpose of the lower triangular factor */
1061 PetscCall(PetscNew(&loTriFactorT));
1062 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063
1064 /* set the matrix descriptors of the lower triangular factor */
1065 matrixType = cusparseGetMatType(loTriFactor->descr);
1066 indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1067 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068 diagType = cusparseGetMatDiagType(loTriFactor->descr);
1069
1070 /* Create the matrix description */
1071 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1072 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1073 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1074 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1075 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076
1077 /* set the operation */
1078 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079
1080 /* allocate GPU space for the CSC of the lower triangular factor*/
1081 loTriFactorT->csrMat = new CsrMatrix;
1082 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1083 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1084 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1085 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088
1089 /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1091 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1092 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1093 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1094 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095 #endif
1096
1097 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1098 {
1099 // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1100 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1101 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1103 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104 #else
1105 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106 #endif
1107 PetscCallCUSPARSE(stat);
1108 }
1109
1110 PetscCallCUDA(WaitForCUDA());
1111 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112
1113 /* Create the solve analysis information */
1114 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1116 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1117 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1118 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1119 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120 #endif
1121
1122 /* perform the solve analysis */
1123 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1124 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1125
1126 PetscCallCUDA(WaitForCUDA());
1127 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128
1129 /* assign the pointer */
1130 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131
1132 /*********************************************/
1133 /* Now the Transpose of the Upper Tri Factor */
1134 /*********************************************/
1135
1136 /* allocate space for the transpose of the upper triangular factor */
1137 PetscCall(PetscNew(&upTriFactorT));
1138 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139
1140 /* set the matrix descriptors of the upper triangular factor */
1141 matrixType = cusparseGetMatType(upTriFactor->descr);
1142 indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1143 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144 diagType = cusparseGetMatDiagType(upTriFactor->descr);
1145
1146 /* Create the matrix description */
1147 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1148 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1149 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1150 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1151 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152
1153 /* set the operation */
1154 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155
1156 /* allocate GPU space for the CSC of the upper triangular factor*/
1157 upTriFactorT->csrMat = new CsrMatrix;
1158 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1159 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1160 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1161 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164
1165 /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1167 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1168 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1170 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171 #endif
1172
1173 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174 {
1175 // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1176 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1177 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1179 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180 #else
1181 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182 #endif
1183 PetscCallCUSPARSE(stat);
1184 }
1185
1186 PetscCallCUDA(WaitForCUDA());
1187 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188
1189 /* Create the solve analysis information */
1190 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1192 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1193 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1195 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196 #endif
1197
1198 /* perform the solve analysis */
1199 /* christ, would it have killed you to put this stuff in a function????????? */
1200 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1201 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202
1203 PetscCallCUDA(WaitForCUDA());
1204 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205
1206 /* assign the pointer */
1207 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1208 PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 #endif
1211
1212 struct PetscScalarToPetscInt {
operator ()PetscScalarToPetscInt1213 __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1214 };
1215
MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)1216 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1217 {
1218 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1219 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1221 cusparseStatus_t stat;
1222 cusparseIndexBase_t indexBase;
1223
1224 PetscFunctionBegin;
1225 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1226 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1227 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1228 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1229 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1230 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1231 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1232 PetscCall(PetscLogGpuTimeBegin());
1233 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1234 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1235 matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1236 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1237 indexBase = cusparseGetMatIndexBase(matstruct->descr);
1238 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1239 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1240
1241 /* set alpha and beta */
1242 PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1243 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1244 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1245 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1246 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1247 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1248
1249 if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1250 CsrMatrix *matrixT = new CsrMatrix;
1251 matstructT->mat = matrixT;
1252 matrixT->num_rows = A->cmap->n;
1253 matrixT->num_cols = A->rmap->n;
1254 matrixT->num_entries = a->nz;
1255 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1256 matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1257 matrixT->values = new THRUSTARRAY(a->nz);
1258
1259 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1260 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1261
1262 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1263 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1264 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265 indexBase, cusparse_scalartype);
1266 PetscCallCUSPARSE(stat);
1267 #else
1268 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1269 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1270
1271 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1272 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1273 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1274 */
1275 if (matrixT->num_entries) {
1276 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1277 PetscCallCUSPARSE(stat);
1278
1279 } else {
1280 matstructT->matDescr = NULL;
1281 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1282 }
1283 #endif
1284 #endif
1285 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1286 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1287 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1288 #else
1289 CsrMatrix *temp = new CsrMatrix;
1290 CsrMatrix *tempT = new CsrMatrix;
1291 /* First convert HYB to CSR */
1292 temp->num_rows = A->rmap->n;
1293 temp->num_cols = A->cmap->n;
1294 temp->num_entries = a->nz;
1295 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1296 temp->column_indices = new THRUSTINTARRAY32(a->nz);
1297 temp->values = new THRUSTARRAY(a->nz);
1298
1299 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1300 PetscCallCUSPARSE(stat);
1301
1302 /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303 tempT->num_rows = A->rmap->n;
1304 tempT->num_cols = A->cmap->n;
1305 tempT->num_entries = a->nz;
1306 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1307 tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308 tempT->values = new THRUSTARRAY(a->nz);
1309
1310 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1311 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1312 PetscCallCUSPARSE(stat);
1313
1314 /* Last, convert CSC to HYB */
1315 cusparseHybMat_t hybMat;
1316 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1317 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1318 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1319 PetscCallCUSPARSE(stat);
1320
1321 /* assign the pointer */
1322 matstructT->mat = hybMat;
1323 A->transupdated = PETSC_TRUE;
1324 /* delete temporaries */
1325 if (tempT) {
1326 if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1327 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1328 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1329 delete (CsrMatrix *)tempT;
1330 }
1331 if (temp) {
1332 if (temp->values) delete (THRUSTARRAY *)temp->values;
1333 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1334 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1335 delete (CsrMatrix *)temp;
1336 }
1337 #endif
1338 }
1339 }
1340 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1341 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1342 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1343 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1344 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1345 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1346 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1347 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1348 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1349 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1350 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1351 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1352 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1353 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1354 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1355 }
1356 if (!cusparsestruct->csr2csc_i) {
1357 THRUSTARRAY csr2csc_a(matrix->num_entries);
1358 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1359
1360 indexBase = cusparseGetMatIndexBase(matstruct->descr);
1361 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1362 void *csr2cscBuffer;
1363 size_t csr2cscBufferSize;
1364 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1365 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1366 PetscCallCUSPARSE(stat);
1367 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1368 #endif
1369
1370 if (matrix->num_entries) {
1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1373 I checked every parameters and they were just fine. I have no clue why cusparse complains.
1374
1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1376 should be filled with indexBase. So I just take a shortcut here.
1377 */
1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1379 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1380 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1381 PetscCallCUSPARSE(stat);
1382 #else
1383 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1384 PetscCallCUSPARSE(stat);
1385 #endif
1386 } else {
1387 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1388 }
1389
1390 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1391 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1392 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1393 PetscCallCUDA(cudaFree(csr2cscBuffer));
1394 #endif
1395 }
1396 PetscCallThrust(
1397 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1398 }
1399 PetscCall(PetscLogGpuTimeEnd());
1400 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1401 /* the compressed row indices is not used for matTranspose */
1402 matstructT->cprowIndices = NULL;
1403 /* assign the pointer */
1404 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1405 A->transupdated = PETSC_TRUE;
1406 PetscFunctionReturn(PETSC_SUCCESS);
1407 }
1408
1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
MatSolve_SeqAIJCUSPARSE_LU(Mat A,Vec b,Vec x)1410 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1411 {
1412 const PetscScalar *barray;
1413 PetscScalar *xarray;
1414 thrust::device_ptr<const PetscScalar> bGPU;
1415 thrust::device_ptr<PetscScalar> xGPU;
1416 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1417 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1418 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1419 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1420 PetscInt m = A->rmap->n;
1421
1422 PetscFunctionBegin;
1423 PetscCall(PetscLogGpuTimeBegin());
1424 PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1425 PetscCall(VecCUDAGetArrayRead(b, &barray));
1426 xGPU = thrust::device_pointer_cast(xarray);
1427 bGPU = thrust::device_pointer_cast(barray);
1428
1429 // Reorder b with the row permutation if needed, and wrap the result in fs->X
1430 if (fs->rpermIndices) {
1431 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1432 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1433 } else {
1434 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1435 }
1436
1437 // Solve L Y = X
1438 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1439 // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1440 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1441
1442 // Solve U X = Y
1443 if (fs->cpermIndices) {
1444 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1445 } else {
1446 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1447 }
1448 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1449
1450 // Reorder X with the column permutation if needed, and put the result back to x
1451 if (fs->cpermIndices) {
1452 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1453 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1454 }
1455 PetscCall(VecCUDARestoreArrayRead(b, &barray));
1456 PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1457 PetscCall(PetscLogGpuTimeEnd());
1458 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1459 PetscFunctionReturn(PETSC_SUCCESS);
1460 }
1461
MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A,Vec b,Vec x)1462 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463 {
1464 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466 const PetscScalar *barray;
1467 PetscScalar *xarray;
1468 thrust::device_ptr<const PetscScalar> bGPU;
1469 thrust::device_ptr<PetscScalar> xGPU;
1470 const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1471 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472 PetscInt m = A->rmap->n;
1473
1474 PetscFunctionBegin;
1475 PetscCall(PetscLogGpuTimeBegin());
1476 if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1480
1481 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485 fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486 }
1487
1488 if (!fs->updatedTransposeSpSVAnalysis) {
1489 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1490
1491 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492 fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493 }
1494
1495 PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496 PetscCall(VecCUDAGetArrayRead(b, &barray));
1497 xGPU = thrust::device_pointer_cast(xarray);
1498 bGPU = thrust::device_pointer_cast(barray);
1499
1500 // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501 if (fs->rpermIndices) {
1502 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504 } else {
1505 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506 }
1507
1508 // Solve Ut Y = X
1509 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1511
1512 // Solve Lt X = Y
1513 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515 } else {
1516 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517 }
1518 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1519
1520 // Reorder X with the column permutation if needed, and put the result back to x
1521 if (fs->cpermIndices) {
1522 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524 }
1525
1526 PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527 PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528 PetscCall(PetscLogGpuTimeEnd());
1529 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530 PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532 #else
1533 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1534 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535 {
1536 PetscInt n = xx->map->n;
1537 const PetscScalar *barray;
1538 PetscScalar *xarray;
1539 thrust::device_ptr<const PetscScalar> bGPU;
1540 thrust::device_ptr<PetscScalar> xGPU;
1541 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1545
1546 PetscFunctionBegin;
1547 /* Analyze the matrix and create the transpose ... on the fly */
1548 if (!loTriFactorT && !upTriFactorT) {
1549 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552 }
1553
1554 /* Get the GPU pointers */
1555 PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1556 PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557 xGPU = thrust::device_pointer_cast(xarray);
1558 bGPU = thrust::device_pointer_cast(barray);
1559
1560 PetscCall(PetscLogGpuTimeBegin());
1561 /* First, reorder with the row permutation */
1562 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1563
1564 /* First, solve U */
1565 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1566 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1567
1568 /* Then, solve L */
1569 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1570 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1571
1572 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1573 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1574
1575 /* Copy the temporary to the full solution. */
1576 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1577
1578 /* restore */
1579 PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1580 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1581 PetscCall(PetscLogGpuTimeEnd());
1582 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1583 PetscFunctionReturn(PETSC_SUCCESS);
1584 }
1585
MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1586 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587 {
1588 const PetscScalar *barray;
1589 PetscScalar *xarray;
1590 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594
1595 PetscFunctionBegin;
1596 /* Analyze the matrix and create the transpose ... on the fly */
1597 if (!loTriFactorT && !upTriFactorT) {
1598 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601 }
1602
1603 /* Get the GPU pointers */
1604 PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1605 PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606
1607 PetscCall(PetscLogGpuTimeBegin());
1608 /* First, solve U */
1609 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1610 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611
1612 /* Then, solve L */
1613 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1614 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615
1616 /* restore */
1617 PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1618 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1619 PetscCall(PetscLogGpuTimeEnd());
1620 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1621 PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623
MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)1624 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625 {
1626 const PetscScalar *barray;
1627 PetscScalar *xarray;
1628 thrust::device_ptr<const PetscScalar> bGPU;
1629 thrust::device_ptr<PetscScalar> xGPU;
1630 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1634
1635 PetscFunctionBegin;
1636 /* Get the GPU pointers */
1637 PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1638 PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639 xGPU = thrust::device_pointer_cast(xarray);
1640 bGPU = thrust::device_pointer_cast(barray);
1641
1642 PetscCall(PetscLogGpuTimeBegin());
1643 /* First, reorder with the row permutation */
1644 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645
1646 /* Next, solve L */
1647 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1648 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649
1650 /* Then, solve U */
1651 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1652 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653
1654 /* Last, reorder with the column permutation */
1655 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1656
1657 PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1658 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1659 PetscCall(PetscLogGpuTimeEnd());
1660 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1661 PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663
MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1664 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665 {
1666 const PetscScalar *barray;
1667 PetscScalar *xarray;
1668 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671 THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1672
1673 PetscFunctionBegin;
1674 /* Get the GPU pointers */
1675 PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1676 PetscCall(VecCUDAGetArrayRead(bb, &barray));
1677
1678 PetscCall(PetscLogGpuTimeBegin());
1679 /* First, solve L */
1680 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1681 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682
1683 /* Next, solve U */
1684 PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1685 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1686
1687 PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1688 PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1689 PetscCall(PetscLogGpuTimeEnd());
1690 PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1691 PetscFunctionReturn(PETSC_SUCCESS);
1692 }
1693 #endif
1694
1695 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *)1696 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697 {
1698 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1700 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701 CsrMatrix *Acsr;
1702 PetscInt m, nz;
1703 PetscBool flg;
1704
1705 PetscFunctionBegin;
1706 if (PetscDefined(USE_DEBUG)) {
1707 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709 }
1710
1711 /* Copy A's value to fact */
1712 m = fact->rmap->n;
1713 nz = aij->nz;
1714 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715 Acsr = (CsrMatrix *)Acusp->mat->mat;
1716 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717
1718 PetscCall(PetscLogGpuTimeBegin());
1719 /* Factorize fact inplace */
1720 if (m)
1721 PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1722 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1723 if (PetscDefined(USE_DEBUG)) {
1724 int numerical_zero;
1725 cusparseStatus_t status;
1726 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1727 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1728 }
1729
1730 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1731 if (fs->updatedSpSVAnalysis) {
1732 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1733 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1734 } else
1735 #endif
1736 {
1737 /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1738 See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1739 */
1740 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1741
1742 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1743
1744 fs->updatedSpSVAnalysis = PETSC_TRUE;
1745 /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1746 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1747 }
1748
1749 fact->offloadmask = PETSC_OFFLOAD_GPU;
1750 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1751 fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1752 fact->ops->matsolve = NULL;
1753 fact->ops->matsolvetranspose = NULL;
1754 PetscCall(PetscLogGpuTimeEnd());
1755 PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1756 PetscFunctionReturn(PETSC_SUCCESS);
1757 }
1758
MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS,IS,const MatFactorInfo * info)1759 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1760 {
1761 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1762 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1763 PetscInt m, nz;
1764
1765 PetscFunctionBegin;
1766 if (PetscDefined(USE_DEBUG)) {
1767 PetscBool flg, diagDense;
1768
1769 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1770 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1771 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1772 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1773 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
1774 }
1775
1776 /* Free the old stale stuff */
1777 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1778
1779 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1780 but they will not be used. Allocate them just for easy debugging.
1781 */
1782 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1783
1784 fact->offloadmask = PETSC_OFFLOAD_BOTH;
1785 fact->factortype = MAT_FACTOR_ILU;
1786 fact->info.factor_mallocs = 0;
1787 fact->info.fill_ratio_given = info->fill;
1788 fact->info.fill_ratio_needed = 1.0;
1789
1790 aij->row = NULL;
1791 aij->col = NULL;
1792
1793 /* ====================================================================== */
1794 /* Copy A's i, j to fact and also allocate the value array of fact. */
1795 /* We'll do in-place factorization on fact */
1796 /* ====================================================================== */
1797 const int *Ai, *Aj;
1798
1799 m = fact->rmap->n;
1800 nz = aij->nz;
1801
1802 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1803 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1804 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1805 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1806 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1807 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1808
1809 /* ====================================================================== */
1810 /* Create descriptors for M, L, U */
1811 /* ====================================================================== */
1812 cusparseFillMode_t fillMode;
1813 cusparseDiagType_t diagType;
1814
1815 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1816 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1817 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1818
1819 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1820 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1821 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1822 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1823 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1824 */
1825 fillMode = CUSPARSE_FILL_MODE_LOWER;
1826 diagType = CUSPARSE_DIAG_TYPE_UNIT;
1827 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1829 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1830
1831 fillMode = CUSPARSE_FILL_MODE_UPPER;
1832 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1833 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1834 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1835 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1836
1837 /* ========================================================================= */
1838 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1839 /* ========================================================================= */
1840 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1841 if (m)
1842 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1843 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1844
1845 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1846 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1847
1848 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1849 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1850
1851 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1852 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1853
1854 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1855 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1856
1857 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1858 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1859 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1860 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1861 */
1862 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1863 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1864 fs->spsvBuffer_L = fs->factBuffer_M;
1865 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1866 } else {
1867 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1868 fs->spsvBuffer_U = fs->factBuffer_M;
1869 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1870 }
1871
1872 /* ========================================================================== */
1873 /* Perform analysis of ilu0 on M, SpSv on L and U */
1874 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1875 /* ========================================================================== */
1876 int structural_zero;
1877 cusparseStatus_t status;
1878
1879 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1880 if (m)
1881 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1882 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1883 if (PetscDefined(USE_DEBUG)) {
1884 /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1885 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1886 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1887 }
1888
1889 /* Estimate FLOPs of the numeric factorization */
1890 {
1891 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1892 PetscInt *Ai, nzRow, nzLeft;
1893 const PetscInt *adiag;
1894 PetscLogDouble flops = 0.0;
1895
1896 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
1897 Ai = Aseq->i;
1898 for (PetscInt i = 0; i < m; i++) {
1899 if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1900 nzRow = Ai[i + 1] - Ai[i];
1901 nzLeft = adiag[i] - Ai[i];
1902 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1903 and include the eliminated one will be updated, which incurs a multiplication and an addition.
1904 */
1905 nzLeft = (nzRow - 1) / 2;
1906 flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1907 }
1908 }
1909 fs->numericFactFlops = flops;
1910 }
1911 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1912 PetscFunctionReturn(PETSC_SUCCESS);
1913 }
1914
MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)1915 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1916 {
1917 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1918 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1919 const PetscScalar *barray;
1920 PetscScalar *xarray;
1921
1922 PetscFunctionBegin;
1923 PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1924 PetscCall(VecCUDAGetArrayRead(b, &barray));
1925 PetscCall(PetscLogGpuTimeBegin());
1926
1927 /* Solve L*y = b */
1928 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1929 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1930 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1931 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1932
1933 /* Solve Lt*x = y */
1934 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1935 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1936 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1937
1938 PetscCall(VecCUDARestoreArrayRead(b, &barray));
1939 PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1940
1941 PetscCall(PetscLogGpuTimeEnd());
1942 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1943 PetscFunctionReturn(PETSC_SUCCESS);
1944 }
1945
MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *)1946 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1947 {
1948 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1949 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1950 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1951 CsrMatrix *Acsr;
1952 PetscInt m, nz;
1953 PetscBool flg;
1954
1955 PetscFunctionBegin;
1956 if (PetscDefined(USE_DEBUG)) {
1957 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1958 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1959 }
1960
1961 /* Copy A's value to fact */
1962 m = fact->rmap->n;
1963 nz = aij->nz;
1964 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1965 Acsr = (CsrMatrix *)Acusp->mat->mat;
1966 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1967
1968 /* Factorize fact inplace */
1969 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1970 csric02() only takes the lower triangular part of matrix A to perform factorization.
1971 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1972 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1973 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1974 */
1975 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1976 if (PetscDefined(USE_DEBUG)) {
1977 int numerical_zero;
1978 cusparseStatus_t status;
1979 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1980 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1981 }
1982
1983 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1984 if (fs->updatedSpSVAnalysis) {
1985 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1986 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1987 } else
1988 #endif
1989 {
1990 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1991
1992 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1993 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1994 */
1995 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1996 fs->updatedSpSVAnalysis = PETSC_TRUE;
1997 }
1998
1999 fact->offloadmask = PETSC_OFFLOAD_GPU;
2000 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
2001 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
2002 fact->ops->matsolve = NULL;
2003 fact->ops->matsolvetranspose = NULL;
2004 PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2005 PetscFunctionReturn(PETSC_SUCCESS);
2006 }
2007
MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS,const MatFactorInfo * info)2008 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2009 {
2010 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2011 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2012 PetscInt m, nz;
2013
2014 PetscFunctionBegin;
2015 if (PetscDefined(USE_DEBUG)) {
2016 PetscBool flg, diagDense;
2017
2018 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2019 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2020 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2021 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2022 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2023 }
2024
2025 /* Free the old stale stuff */
2026 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2027
2028 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2029 but they will not be used. Allocate them just for easy debugging.
2030 */
2031 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2032
2033 fact->offloadmask = PETSC_OFFLOAD_BOTH;
2034 fact->factortype = MAT_FACTOR_ICC;
2035 fact->info.factor_mallocs = 0;
2036 fact->info.fill_ratio_given = info->fill;
2037 fact->info.fill_ratio_needed = 1.0;
2038
2039 aij->row = NULL;
2040 aij->col = NULL;
2041
2042 /* ====================================================================== */
2043 /* Copy A's i, j to fact and also allocate the value array of fact. */
2044 /* We'll do in-place factorization on fact */
2045 /* ====================================================================== */
2046 const int *Ai, *Aj;
2047
2048 m = fact->rmap->n;
2049 nz = aij->nz;
2050
2051 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2052 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2053 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2054 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2055 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2056 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2057
2058 /* ====================================================================== */
2059 /* Create mat descriptors for M, L */
2060 /* ====================================================================== */
2061 cusparseFillMode_t fillMode;
2062 cusparseDiagType_t diagType;
2063
2064 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2065 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2066 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2067
2068 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2069 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2070 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2071 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2072 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2073 */
2074 fillMode = CUSPARSE_FILL_MODE_LOWER;
2075 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2076 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2077 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2078 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2079
2080 /* ========================================================================= */
2081 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2082 /* ========================================================================= */
2083 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2084 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2085
2086 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2087 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2088
2089 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2090 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2091
2092 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2093 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2094
2095 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2096 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2097
2098 /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2099 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2100 */
2101 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2102 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2103 fs->spsvBuffer_L = fs->factBuffer_M;
2104 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2105 } else {
2106 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2107 fs->spsvBuffer_Lt = fs->factBuffer_M;
2108 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2109 }
2110
2111 /* ========================================================================== */
2112 /* Perform analysis of ic0 on M */
2113 /* The lower triangular part of M has the same sparsity pattern as L */
2114 /* ========================================================================== */
2115 int structural_zero;
2116 cusparseStatus_t status;
2117
2118 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2119 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2120 if (PetscDefined(USE_DEBUG)) {
2121 /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2122 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2123 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2124 }
2125
2126 /* Estimate FLOPs of the numeric factorization */
2127 {
2128 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2129 PetscInt *Ai, nzRow, nzLeft;
2130 PetscLogDouble flops = 0.0;
2131
2132 Ai = Aseq->i;
2133 for (PetscInt i = 0; i < m; i++) {
2134 nzRow = Ai[i + 1] - Ai[i];
2135 if (nzRow > 1) {
2136 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2137 and include the eliminated one will be updated, which incurs a multiplication and an addition.
2138 */
2139 nzLeft = (nzRow - 1) / 2;
2140 flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2141 }
2142 }
2143 fs->numericFactFlops = flops;
2144 }
2145 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2146 PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 #endif
2149
MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo * info)2150 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2151 {
2152 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2153 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2154
2155 PetscFunctionBegin;
2156 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2157 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2158 B->offloadmask = PETSC_OFFLOAD_CPU;
2159
2160 if (!cusparsestruct->use_cpu_solve) {
2161 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2162 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2163 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2164 #else
2165 /* determine which version of MatSolve needs to be used. */
2166 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2167 IS isrow = b->row, iscol = b->col;
2168 PetscBool row_identity, col_identity;
2169
2170 PetscCall(ISIdentity(isrow, &row_identity));
2171 PetscCall(ISIdentity(iscol, &col_identity));
2172 if (row_identity && col_identity) {
2173 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2174 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2175 } else {
2176 B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2177 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2178 }
2179 #endif
2180 }
2181 B->ops->matsolve = NULL;
2182 B->ops->matsolvetranspose = NULL;
2183
2184 /* get the triangular factors */
2185 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2186 PetscFunctionReturn(PETSC_SUCCESS);
2187 }
2188
MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2189 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2190 {
2191 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2192
2193 PetscFunctionBegin;
2194 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2195 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2196 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2197 PetscFunctionReturn(PETSC_SUCCESS);
2198 }
2199
MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2200 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2201 {
2202 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2203
2204 PetscFunctionBegin;
2205 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2206 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2207 if (!info->factoronhost) {
2208 PetscCall(ISIdentity(isrow, &row_identity));
2209 PetscCall(ISIdentity(iscol, &col_identity));
2210 }
2211 if (!info->levels && row_identity && col_identity) {
2212 PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2213 } else
2214 #endif
2215 {
2216 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2217 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2218 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2219 }
2220 PetscFunctionReturn(PETSC_SUCCESS);
2221 }
2222
MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)2223 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2224 {
2225 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2226
2227 PetscFunctionBegin;
2228 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2229 PetscBool perm_identity = PETSC_FALSE;
2230 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2231 if (!info->levels && perm_identity) {
2232 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2233 } else
2234 #endif
2235 {
2236 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2237 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2238 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2239 }
2240 PetscFunctionReturn(PETSC_SUCCESS);
2241 }
2242
MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)2243 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2244 {
2245 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2246
2247 PetscFunctionBegin;
2248 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2249 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2250 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2251 PetscFunctionReturn(PETSC_SUCCESS);
2252 }
2253
MatFactorGetSolverType_seqaij_cusparse(Mat,MatSolverType * type)2254 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2255 {
2256 PetscFunctionBegin;
2257 *type = MATSOLVERCUSPARSE;
2258 PetscFunctionReturn(PETSC_SUCCESS);
2259 }
2260
2261 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2268
2269 Level: beginner
2270
2271 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2272 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2273 M*/
2274
MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat * B)2275 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2276 {
2277 PetscInt n = A->rmap->n;
2278
2279 PetscFunctionBegin;
2280 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2281 PetscCall(MatSetSizes(*B, n, n, n, n));
2282 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2283 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2284
2285 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2286 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2287 PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2288 if (!A->boundtocpu) {
2289 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2290 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2291 } else {
2292 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2293 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2294 }
2295 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2296 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2297 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2298 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2299 if (!A->boundtocpu) {
2300 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2301 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2302 } else {
2303 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2304 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2305 }
2306 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2307 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2308 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2309
2310 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2311 (*B)->canuseordering = PETSC_TRUE;
2312 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2313 PetscFunctionReturn(PETSC_SUCCESS);
2314 }
2315
MatSeqAIJCUSPARSECopyFromGPU(Mat A)2316 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2317 {
2318 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2319 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2320 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2321 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2322 #endif
2323
2324 PetscFunctionBegin;
2325 if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2326 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2327 if (A->factortype == MAT_FACTOR_NONE) {
2328 CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2329 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2330 }
2331 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2332 else if (fs->csrVal) {
2333 /* We have a factorized matrix on device and are able to copy it to host */
2334 PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2335 }
2336 #endif
2337 else
2338 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2339 PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2340 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2341 A->offloadmask = PETSC_OFFLOAD_BOTH;
2342 }
2343 PetscFunctionReturn(PETSC_SUCCESS);
2344 }
2345
MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar * array[])2346 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2347 {
2348 PetscFunctionBegin;
2349 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2350 *array = ((Mat_SeqAIJ *)A->data)->a;
2351 PetscFunctionReturn(PETSC_SUCCESS);
2352 }
2353
MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar * array[])2354 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2355 {
2356 PetscFunctionBegin;
2357 A->offloadmask = PETSC_OFFLOAD_CPU;
2358 *array = NULL;
2359 PetscFunctionReturn(PETSC_SUCCESS);
2360 }
2361
MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar * array[])2362 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2363 {
2364 PetscFunctionBegin;
2365 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2366 *array = ((Mat_SeqAIJ *)A->data)->a;
2367 PetscFunctionReturn(PETSC_SUCCESS);
2368 }
2369
MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat,const PetscScalar * array[])2370 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2371 {
2372 PetscFunctionBegin;
2373 *array = NULL;
2374 PetscFunctionReturn(PETSC_SUCCESS);
2375 }
2376
MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar * array[])2377 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2378 {
2379 PetscFunctionBegin;
2380 *array = ((Mat_SeqAIJ *)A->data)->a;
2381 PetscFunctionReturn(PETSC_SUCCESS);
2382 }
2383
MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar * array[])2384 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2385 {
2386 PetscFunctionBegin;
2387 A->offloadmask = PETSC_OFFLOAD_CPU;
2388 *array = NULL;
2389 PetscFunctionReturn(PETSC_SUCCESS);
2390 }
2391
MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt ** i,const PetscInt ** j,PetscScalar ** a,PetscMemType * mtype)2392 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2393 {
2394 Mat_SeqAIJCUSPARSE *cusp;
2395 CsrMatrix *matrix;
2396
2397 PetscFunctionBegin;
2398 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2399 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2400 cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2401 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2402 matrix = (CsrMatrix *)cusp->mat->mat;
2403
2404 if (i) {
2405 #if !defined(PETSC_USE_64BIT_INDICES)
2406 *i = matrix->row_offsets->data().get();
2407 #else
2408 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2409 #endif
2410 }
2411 if (j) {
2412 #if !defined(PETSC_USE_64BIT_INDICES)
2413 *j = matrix->column_indices->data().get();
2414 #else
2415 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2416 #endif
2417 }
2418 if (a) *a = matrix->values->data().get();
2419 if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2420 PetscFunctionReturn(PETSC_SUCCESS);
2421 }
2422
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device copy of a MATSEQAIJCUSPARSE matrix up to date with the host data.

  Nothing is copied unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU. If the nonzero
  state recorded in the device structure equals A->nonzerostate and the storage format is CSR, only the
  values are copied. Otherwise the device structure is destroyed and rebuilt from the host CSR arrays
  (from the compressed-row arrays when a->compressedrow.use is set).

  It is an error to call this on a matrix that is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* cleared when there are no host values to upload */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* the values changed, so a cached transpose is invalidated (second argument PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* full rebuild: discard every device-side structure derived from the old matrix */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* select the host row structure: compressed rows (with their row index map) or the plain CSR rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values: take the entry count from the row offsets and leave the offload mask untouched */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* scalar constants kept in device memory; the handle is switched to device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build the device matrix in the storage format chosen for this matrix (CSR, or ELL/HYB on CUDA < 11) */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          /* the values array is always allocated, but only filled when host values exist */
          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary device CSR matrix, convert it to HYB/ELL, then free the temporary */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* logged bytes: row offsets and column indices, compressed-row indices (tmp entries), 3 scalar constants and the values */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* remember which nonzero pattern the device structure corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2571
/* Functor for 2-element tuples: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2579
/* Functor for 2-element tuples: copies element 0 into element 1 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2587
/* Functor for 2-element tuples: copies element 1 into element 0 (the reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2595
/* Data attached to C->product->data by the cuSPARSE MatProduct implementations; released by MatProductCtxDestroy_MatMatCusparse() */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* C was of type MATSEQDENSE at symbolic time; the numeric phase converts the result back to MATSEQDENSE */
  PetscScalar   *Bt;       /* device buffer for B^T, allocated only with CUDA < 11 for ABt and RARt products */
  Mat            X;        /* intermediate dense matrix, created only for PtAP and RARt products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* NOTE(review): presumably the flop count logged for the product - confirm against the code that sets it */
  CsrMatrix     *Bcsr;     /* released with delete when the context is destroyed */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse matrix descriptor, destroyed together with the context */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C; set once the SpMM descriptors and work buffer are set up */
  cusparseDnMatDescr_t matBDescr;   /* dense matrix descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense matrix descriptor for C or, for PtAP/RARt, for X */
  PetscInt             Blda, Clda;  /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device buffer, released with cudaFree() */
  void *dBuffer5; /* device buffer, released with cudaFree() */
  #endif
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                 *mmBuffer;     /* device work buffer passed to cusparseSpMM() */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed together with the context */
#endif
};
2620
MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)2621 static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
2622 {
2623 MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;
2624
2625 PetscFunctionBegin;
2626 PetscCallCUDA(cudaFree(mmdata->Bt));
2627 delete mmdata->Bcsr;
2628 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2629 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2630 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2631 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2632 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2633 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2634 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2635 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2636 #endif
2637 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2638 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2639 #endif
2640 PetscCall(MatDestroy(&mmdata->X));
2641 PetscCall(PetscFree(mmdata));
2642 PetscFunctionReturn(PETSC_SUCCESS);
2643 }
2644
2645 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2646
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of a product of a sparse A (MATSEQAIJCUSPARSE) with a dense B.

  Supported product types are AB, AtB, ABt, PtAP and RARt. The sparse-times-dense product is computed with
  cusparseSpMM() (CUDA >= 11) or cusparse_csr_spmm() (older CUDA). For PtAP and RARt that product is written
  to the intermediate matrix mmdata->X and C is then formed by MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal().

  If B is not of type MATSEQDENSECUDA it is converted in place for the duration of the call and converted
  back to MATSEQDENSE at the end. If C was of type MATSEQDENSE at symbolic time (mmdata->cisdense) it is
  converted back to MATSEQDENSE at the end.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatProductCtx_MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatProductCtx_MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select the device matrix, the operation applied to A, and the dimensions m x n of the SpMM output */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cuSPARSE apply the transpose, or use the explicitly formed transpose of A */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* the SpMM output goes to the intermediate X for the triple products, directly to C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* reference to the sparse descriptor of A: one per operation since CUDA 12.4, a single shared one before */
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    /* NOTE(review): a dense descriptor that already exists and whose LDA is unchanged is kept as is in this
       branch, without refreshing its values pointer - confirm barray/carray cannot have changed in that case */
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* the work buffer only grows: it is reallocated when missing or smaller than required */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt with cublasXgeam; its leading dimension is the column count of B */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* finish the triple products with a dense-dense multiply of B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary type changes of C and B */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2822
MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)2823 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2824 {
2825 Mat_Product *product = C->product;
2826 Mat A, B;
2827 PetscInt m, n;
2828 PetscBool cisdense, flg;
2829 MatProductCtx_MatMatCusparse *mmdata;
2830 Mat_SeqAIJCUSPARSE *cusp;
2831
2832 PetscFunctionBegin;
2833 MatCheckProduct(C, 1);
2834 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2835 A = product->A;
2836 B = product->B;
2837 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2838 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2839 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2840 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2841 switch (product->type) {
2842 case MATPRODUCT_AB:
2843 m = A->rmap->n;
2844 n = B->cmap->n;
2845 PetscCall(MatSetBlockSizesFromMats(C, A, B));
2846 break;
2847 case MATPRODUCT_AtB:
2848 m = A->cmap->n;
2849 n = B->cmap->n;
2850 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2851 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2852 break;
2853 case MATPRODUCT_ABt:
2854 m = A->rmap->n;
2855 n = B->rmap->n;
2856 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2857 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2858 break;
2859 case MATPRODUCT_PtAP:
2860 m = B->cmap->n;
2861 n = B->cmap->n;
2862 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2863 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2864 break;
2865 case MATPRODUCT_RARt:
2866 m = B->rmap->n;
2867 n = B->rmap->n;
2868 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2869 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2870 break;
2871 default:
2872 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2873 }
2874 PetscCall(MatSetSizes(C, m, n, m, n));
2875 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2876 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2877 PetscCall(MatSetType(C, MATSEQDENSECUDA));
2878
2879 /* product data */
2880 PetscCall(PetscNew(&mmdata));
2881 mmdata->cisdense = cisdense;
2882 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2883 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2884 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2885 #endif
2886 /* for these products we need intermediate storage */
2887 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2888 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2889 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2890 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2891 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2892 } else {
2893 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2894 }
2895 }
2896 C->product->data = mmdata;
2897 C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2898
2899 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2900 PetscFunctionReturn(PETSC_SUCCESS);
2901 }
2902
MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)2903 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2904 {
2905 Mat_Product *product = C->product;
2906 Mat A, B;
2907 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2908 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2909 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2910 CsrMatrix *Acsr, *Bcsr, *Ccsr;
2911 PetscBool flg;
2912 cusparseStatus_t stat;
2913 MatProductType ptype;
2914 MatProductCtx_MatMatCusparse *mmdata;
2915 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2916 cusparseSpMatDescr_t BmatSpDescr;
2917 #endif
2918 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2919
2920 PetscFunctionBegin;
2921 MatCheckProduct(C, 1);
2922 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2923 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2924 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2925 mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2926 A = product->A;
2927 B = product->B;
2928 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2929 mmdata->reusesym = PETSC_FALSE;
2930 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2931 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2932 Cmat = Ccusp->mat;
2933 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2934 Ccsr = (CsrMatrix *)Cmat->mat;
2935 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2936 goto finalize;
2937 }
2938 if (!c->nz) goto finalize;
2939 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2940 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2941 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2942 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2943 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2944 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2946 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2947 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2948 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2949 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2950 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2951 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2952 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2953
2954 ptype = product->type;
2955 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2956 ptype = MATPRODUCT_AB;
2957 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2958 }
2959 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2960 ptype = MATPRODUCT_AB;
2961 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2962 }
2963 switch (ptype) {
2964 case MATPRODUCT_AB:
2965 Amat = Acusp->mat;
2966 Bmat = Bcusp->mat;
2967 break;
2968 case MATPRODUCT_AtB:
2969 Amat = Acusp->matTranspose;
2970 Bmat = Bcusp->mat;
2971 break;
2972 case MATPRODUCT_ABt:
2973 Amat = Acusp->mat;
2974 Bmat = Bcusp->matTranspose;
2975 break;
2976 default:
2977 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2978 }
2979 Cmat = Ccusp->mat;
2980 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2981 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2982 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2983 Acsr = (CsrMatrix *)Amat->mat;
2984 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2985 Ccsr = (CsrMatrix *)Cmat->mat;
2986 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2987 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2988 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2989 PetscCall(PetscLogGpuTimeBegin());
2990 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2991 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2992 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2993 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2994 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2995 PetscCallCUSPARSE(stat);
2996 #else
2997 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2998 PetscCallCUSPARSE(stat);
2999 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3000 PetscCallCUSPARSE(stat);
3001 #endif
3002 #else
3003 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3004 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3005 PetscCallCUSPARSE(stat);
3006 #endif
3007 PetscCall(PetscLogGpuFlops(mmdata->flops));
3008 PetscCallCUDA(WaitForCUDA());
3009 PetscCall(PetscLogGpuTimeEnd());
3010 C->offloadmask = PETSC_OFFLOAD_GPU;
3011 finalize:
3012 /* shorter version of MatAssemblyEnd_SeqAIJ */
3013 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3014 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3015 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3016 c->reallocs = 0;
3017 C->info.mallocs += 0;
3018 C->info.nz_unneeded = 0;
3019 C->assembled = C->was_assembled = PETSC_TRUE;
3020 C->num_ass++;
3021 PetscFunctionReturn(PETSC_SUCCESS);
3022 }
3023
MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)3024 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025 {
3026 Mat_Product *product = C->product;
3027 Mat A, B;
3028 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3029 Mat_SeqAIJ *a, *b, *c;
3030 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031 CsrMatrix *Acsr, *Bcsr, *Ccsr;
3032 PetscInt i, j, m, n, k;
3033 PetscBool flg;
3034 cusparseStatus_t stat;
3035 MatProductType ptype;
3036 MatProductCtx_MatMatCusparse *mmdata;
3037 PetscLogDouble flops;
3038 PetscBool biscompressed, ciscompressed;
3039 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040 int64_t C_num_rows1, C_num_cols1, C_nnz1;
3041 cusparseSpMatDescr_t BmatSpDescr;
3042 #else
3043 int cnz;
3044 #endif
3045 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3046
3047 PetscFunctionBegin;
3048 MatCheckProduct(C, 1);
3049 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050 A = product->A;
3051 B = product->B;
3052 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3053 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3054 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3055 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056 a = (Mat_SeqAIJ *)A->data;
3057 b = (Mat_SeqAIJ *)B->data;
3058 /* product data */
3059 PetscCall(PetscNew(&mmdata));
3060 C->product->data = mmdata;
3061 C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3062
3063 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3064 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3067 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3068 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069
3070 ptype = product->type;
3071 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072 ptype = MATPRODUCT_AB;
3073 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074 }
3075 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076 ptype = MATPRODUCT_AB;
3077 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078 }
3079 biscompressed = PETSC_FALSE;
3080 ciscompressed = PETSC_FALSE;
3081 switch (ptype) {
3082 case MATPRODUCT_AB:
3083 m = A->rmap->n;
3084 n = B->cmap->n;
3085 k = A->cmap->n;
3086 Amat = Acusp->mat;
3087 Bmat = Bcusp->mat;
3088 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090 break;
3091 case MATPRODUCT_AtB:
3092 m = A->cmap->n;
3093 n = B->cmap->n;
3094 k = A->rmap->n;
3095 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096 Amat = Acusp->matTranspose;
3097 Bmat = Bcusp->mat;
3098 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099 break;
3100 case MATPRODUCT_ABt:
3101 m = A->rmap->n;
3102 n = B->rmap->n;
3103 k = A->cmap->n;
3104 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105 Amat = Acusp->mat;
3106 Bmat = Bcusp->matTranspose;
3107 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108 break;
3109 default:
3110 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111 }
3112
3113 /* create cusparse matrix */
3114 PetscCall(MatSetSizes(C, m, n, m, n));
3115 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116 c = (Mat_SeqAIJ *)C->data;
3117 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118 Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3119 Ccsr = new CsrMatrix;
3120
3121 c->compressedrow.use = ciscompressed;
3122 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3123 c->compressedrow.nrows = a->compressedrow.nrows;
3124 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3125 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3127 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129 } else {
3130 c->compressedrow.nrows = 0;
3131 c->compressedrow.i = NULL;
3132 c->compressedrow.rindex = NULL;
3133 Ccusp->workVector = NULL;
3134 Cmat->cprowIndices = NULL;
3135 }
3136 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3137 Ccusp->mat = Cmat;
3138 Ccusp->mat->mat = Ccsr;
3139 Ccsr->num_rows = Ccusp->nrows;
3140 Ccsr->num_cols = n;
3141 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3142 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3143 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3144 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3148 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3150 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3152 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153 c->nz = 0;
3154 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155 Ccsr->values = new THRUSTARRAY(c->nz);
3156 goto finalizesym;
3157 }
3158
3159 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3160 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161 Acsr = (CsrMatrix *)Amat->mat;
3162 if (!biscompressed) {
3163 Bcsr = (CsrMatrix *)Bmat->mat;
3164 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165 BmatSpDescr = Bmat->matDescr;
3166 #endif
3167 } else { /* we need to use row offsets for the full matrix */
3168 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3169 Bcsr = new CsrMatrix;
3170 Bcsr->num_rows = B->rmap->n;
3171 Bcsr->num_cols = cBcsr->num_cols;
3172 Bcsr->num_entries = cBcsr->num_entries;
3173 Bcsr->column_indices = cBcsr->column_indices;
3174 Bcsr->values = cBcsr->values;
3175 if (!Bcusp->rowoffsets_gpu) {
3176 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3178 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179 }
3180 Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181 mmdata->Bcsr = Bcsr;
3182 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183 if (Bcsr->num_rows && Bcsr->num_cols) {
3184 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3185 PetscCallCUSPARSE(stat);
3186 }
3187 BmatSpDescr = mmdata->matSpBDescr;
3188 #endif
3189 }
3190 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3191 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192 /* precompute flops count */
3193 if (ptype == MATPRODUCT_AB) {
3194 for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195 const PetscInt st = a->i[i];
3196 const PetscInt en = a->i[i + 1];
3197 for (j = st; j < en; j++) {
3198 const PetscInt brow = a->j[j];
3199 flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200 }
3201 }
3202 } else if (ptype == MATPRODUCT_AtB) {
3203 for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204 const PetscInt anzi = a->i[i + 1] - a->i[i];
3205 const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206 flops += (2. * anzi) * bnzi;
3207 }
3208 } else { /* TODO */
3209 flops = 0.;
3210 }
3211
3212 mmdata->flops = flops;
3213 PetscCall(PetscLogGpuTimeBegin());
3214
3215 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3216 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3217 // cuda-12.2 requires non-null csrRowOffsets
3218 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3219 PetscCallCUSPARSE(stat);
3220 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222 {
3223 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3224 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3225 */
3226 void *dBuffer1 = NULL;
3227 void *dBuffer2 = NULL;
3228 void *dBuffer3 = NULL;
3229 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230 size_t bufferSize1 = 0;
3231 size_t bufferSize2 = 0;
3232 size_t bufferSize3 = 0;
3233 size_t bufferSize4 = 0;
3234 size_t bufferSize5 = 0;
3235
3236 /* ask bufferSize1 bytes for external memory */
3237 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3238 PetscCallCUSPARSE(stat);
3239 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240 /* inspect the matrices A and B to understand the memory requirement for the next step */
3241 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3242 PetscCallCUSPARSE(stat);
3243
3244 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3245 PetscCallCUSPARSE(stat);
3246 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3247 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3248 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3249 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3250 PetscCallCUSPARSE(stat);
3251 PetscCallCUDA(cudaFree(dBuffer1));
3252 PetscCallCUDA(cudaFree(dBuffer2));
3253
3254 /* get matrix C non-zero entries C_nnz1 */
3255 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256 c->nz = (PetscInt)C_nnz1;
3257 /* allocate matrix C */
3258 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3259 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260 Ccsr->values = new THRUSTARRAY(c->nz);
3261 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262 /* update matC with the new pointers */
3263 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3264 PetscCallCUSPARSE(stat);
3265
3266 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3267 PetscCallCUSPARSE(stat);
3268 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3269 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3270 PetscCallCUSPARSE(stat);
3271 PetscCallCUDA(cudaFree(dBuffer3));
3272 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3273 PetscCallCUSPARSE(stat);
3274 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275 }
3276 #else
3277 size_t bufSize2;
3278 /* ask bufferSize bytes for external memory */
3279 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3280 PetscCallCUSPARSE(stat);
3281 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282 /* inspect the matrices A and B to understand the memory requirement for the next step */
3283 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3284 PetscCallCUSPARSE(stat);
3285 /* ask bufferSize again bytes for external memory */
3286 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3287 PetscCallCUSPARSE(stat);
3288 /* The CUSPARSE documentation is not clear, nor the API
3289 We need both buffers to perform the operations properly!
3290 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3291 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3292 is stored in the descriptor! What a messy API... */
3293 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294 /* compute the intermediate product of A * B */
3295 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3296 PetscCallCUSPARSE(stat);
3297 /* get matrix C non-zero entries C_nnz1 */
3298 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299 c->nz = (PetscInt)C_nnz1;
3300 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3301 mmdata->mmBufferSize / 1024));
3302 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3303 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304 Ccsr->values = new THRUSTARRAY(c->nz);
3305 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3306 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3307 PetscCallCUSPARSE(stat);
3308 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3309 PetscCallCUSPARSE(stat);
3310 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311 #else
3312 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3313 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3314 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3315 PetscCallCUSPARSE(stat);
3316 c->nz = cnz;
3317 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3318 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319 Ccsr->values = new THRUSTARRAY(c->nz);
3320 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3321
3322 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3324 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3325 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3326 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3327 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3328 PetscCallCUSPARSE(stat);
3329 #endif
3330 PetscCall(PetscLogGpuFlops(mmdata->flops));
3331 PetscCall(PetscLogGpuTimeEnd());
3332 finalizesym:
3333 c->free_a = PETSC_TRUE;
3334 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3335 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336 c->free_ij = PETSC_TRUE;
3337 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338 PetscInt *d_i = c->i;
3339 THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340 THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341 ii = *Ccsr->row_offsets;
3342 jj = *Ccsr->column_indices;
3343 if (ciscompressed) d_i = c->compressedrow.i;
3344 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346 } else {
3347 PetscInt *d_i = c->i;
3348 if (ciscompressed) d_i = c->compressedrow.i;
3349 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3350 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351 }
3352 if (ciscompressed) { /* need to expand host row offsets */
3353 PetscInt r = 0;
3354 c->i[0] = 0;
3355 for (k = 0; k < c->compressedrow.nrows; k++) {
3356 const PetscInt next = c->compressedrow.rindex[k];
3357 const PetscInt old = c->compressedrow.i[k];
3358 for (; r < next; r++) c->i[r + 1] = old;
3359 }
3360 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361 }
3362 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3363 PetscCall(PetscMalloc1(m, &c->ilen));
3364 PetscCall(PetscMalloc1(m, &c->imax));
3365 c->maxnz = c->nz;
3366 c->nonzerorowcnt = 0;
3367 c->rmax = 0;
3368 for (k = 0; k < m; k++) {
3369 const PetscInt nn = c->i[k + 1] - c->i[k];
3370 c->ilen[k] = c->imax[k] = nn;
3371 c->nonzerorowcnt += (PetscInt)!!nn;
3372 c->rmax = PetscMax(c->rmax, nn);
3373 }
3374 PetscCall(PetscMalloc1(c->nz, &c->a));
3375 Ccsr->num_entries = c->nz;
3376
3377 C->nonzerostate++;
3378 PetscCall(PetscLayoutSetUp(C->rmap));
3379 PetscCall(PetscLayoutSetUp(C->cmap));
3380 Ccusp->nonzerostate = C->nonzerostate;
3381 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3382 C->preallocated = PETSC_TRUE;
3383 C->assembled = PETSC_FALSE;
3384 C->was_assembled = PETSC_FALSE;
3385 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3386 mmdata->reusesym = PETSC_TRUE;
3387 C->offloadmask = PETSC_OFFLOAD_GPU;
3388 }
3389 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3390 PetscFunctionReturn(PETSC_SUCCESS);
3391 }
3392
3393 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3394
3395 /* handles sparse or dense B */
MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)3396 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3397 {
3398 Mat_Product *product = mat->product;
3399 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3400
3401 PetscFunctionBegin;
3402 MatCheckProduct(mat, 1);
3403 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3404 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3405 if (product->type == MATPRODUCT_ABC) {
3406 Ciscusp = PETSC_FALSE;
3407 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3408 }
3409 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3410 PetscBool usecpu = PETSC_FALSE;
3411 switch (product->type) {
3412 case MATPRODUCT_AB:
3413 if (product->api_user) {
3414 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3415 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3416 PetscOptionsEnd();
3417 } else {
3418 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3419 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3420 PetscOptionsEnd();
3421 }
3422 break;
3423 case MATPRODUCT_AtB:
3424 if (product->api_user) {
3425 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3426 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3427 PetscOptionsEnd();
3428 } else {
3429 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3430 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3431 PetscOptionsEnd();
3432 }
3433 break;
3434 case MATPRODUCT_PtAP:
3435 if (product->api_user) {
3436 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3437 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3438 PetscOptionsEnd();
3439 } else {
3440 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3441 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3442 PetscOptionsEnd();
3443 }
3444 break;
3445 case MATPRODUCT_RARt:
3446 if (product->api_user) {
3447 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3448 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3449 PetscOptionsEnd();
3450 } else {
3451 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3452 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3453 PetscOptionsEnd();
3454 }
3455 break;
3456 case MATPRODUCT_ABC:
3457 if (product->api_user) {
3458 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3459 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3460 PetscOptionsEnd();
3461 } else {
3462 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3463 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3464 PetscOptionsEnd();
3465 }
3466 break;
3467 default:
3468 break;
3469 }
3470 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3471 }
3472 /* dispatch */
3473 if (isdense) {
3474 switch (product->type) {
3475 case MATPRODUCT_AB:
3476 case MATPRODUCT_AtB:
3477 case MATPRODUCT_ABt:
3478 case MATPRODUCT_PtAP:
3479 case MATPRODUCT_RARt:
3480 if (product->A->boundtocpu) {
3481 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3482 } else {
3483 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3484 }
3485 break;
3486 case MATPRODUCT_ABC:
3487 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3488 break;
3489 default:
3490 break;
3491 }
3492 } else if (Biscusp && Ciscusp) {
3493 switch (product->type) {
3494 case MATPRODUCT_AB:
3495 case MATPRODUCT_AtB:
3496 case MATPRODUCT_ABt:
3497 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3498 break;
3499 case MATPRODUCT_PtAP:
3500 case MATPRODUCT_RARt:
3501 case MATPRODUCT_ABC:
3502 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3503 break;
3504 default:
3505 break;
3506 }
3507 } else { /* fallback for AIJ */
3508 PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3509 }
3510 PetscFunctionReturn(PETSC_SUCCESS);
3511 }
3512
/* yy = A xx on the GPU, implemented as the shared multiply-add driver with no vector to add */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE; /* plain, non-transposed product */

  PetscFunctionBegin;
  /* NULL in the third slot means "nothing to add"; yy is the output vector */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3519
/* zz = A xx + yy on the GPU, forwarded to the shared multiply-add driver */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE; /* plain, non-transposed product */

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3526
/* yy = A^H xx on the GPU: the shared driver is asked for a transposed and Hermitian product with nothing added */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE; /* both flags set selects the conjugate transpose */

  PetscFunctionBegin;
  /* NULL in the third slot means "nothing to add"; yy is the output vector */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3533
/* zz = A^H xx + yy on the GPU, forwarded to the shared multiply-add driver */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE; /* both flags set selects the conjugate transpose */

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3540
/* yy = A^T xx on the GPU: the shared driver is asked for a transposed, non-Hermitian product with nothing added */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE; /* transpose without conjugation */

  PetscFunctionBegin;
  /* NULL in the third slot means "nothing to add"; yy is the output vector */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3547
/*
  ScatterAdd - accumulates a short vector into selected entries of a longer one: y[idx[i]] += x[i] for 0 <= i < n

  Each thread handles the single index i given by its global 1D thread index; threads with i >= n do nothing,
  so any 1D launch that provides at least n threads covers the input.
  The update is a plain (non-atomic) read-modify-write.
  NOTE(review): that is only race-free if the entries of idx are pairwise distinct - confirm against the caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* build the index in PetscInt width (matters with 64-bit indices) instead of narrowing it to int before comparing with n */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3553
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common GPU driver behind the MatMult/MatMultAdd family: z = op(A) x + y.
  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+ A     - the matrix
. xx    - the vector op(A) is applied to
. yy    - the vector to add, or NULL for a pure product; may be the same object as zz
. trans - apply the transpose of A
- herm  - apply the conjugate transpose; only valid together with trans

  Output Parameter:
. zz - the result vector

  Notes:
  zz is obtained read-write only when it is the same object as yy; otherwise it is obtained write-only, so its
  previous contents must never be accumulated into the result.

  With trans and !herm, an explicitly stored transpose is multiplied in non-transposed mode when
  A->form_explicit_transpose is set; otherwise cuSPARSE is asked to apply op to the original matrix.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* no stored entries: op(A) x = 0, so z is just y (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length, so it is written directly into z. z holds valid data only when it is the
         same vector as y (it is obtained write-only otherwise), so we accumulate with beta = 1 only in that
         case; a distinct y is added after the SpMV.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* kernel launches return nothing: pick up a bad launch configuration here */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* the SpMV above left zz = op(A) xx when yy is a different vector, so yy still has to be added */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3727
/* zz = A^T xx + yy on the GPU, forwarded to the shared multiply-add driver */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE; /* transpose without conjugation */

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3734
3735 PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
3736
/*
  GetDiagonal_CSR - extracts the diagonal of a matrix stored in CSR form, one thread per row

  row, col and val are the row offsets, column indices and values of the CSR storage; len is the number of
  rows to process. diag[r] receives the first stored entry of row r whose column index equals r, or zero
  when the row stores no such entry. Threads whose global index is >= len do nothing.
*/
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r >= len) return; /* surplus thread past the last row */

  const PetscInt first = row[r], count = row[r + 1] - first;
  PetscScalar    entry = 0.0; /* value reported when the diagonal is not stored */

  /* walk the stored entries of row r and stop at the first one sitting on the diagonal */
  for (PetscInt k = first; k < first + count; k++) {
    if (col[k] == r) {
      entry = val[k];
      break;
    }
  }
  diag[r] = entry;
}
3754
/*
  Fills diag with the diagonal of A. When the offload mask says the GPU copy is current (GPU or BOTH) the
  diagonal is gathered by a kernel from the device CSR data; otherwise the host SeqAIJ routine is used.
*/
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  PetscScalar                  *d_diag;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    /* the device copy is not flagged as current: use the host implementation */
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  {
    PetscInt   nrows = A->rmap->n;
    CsrMatrix *csr   = (CsrMatrix *)mult->mat;

    PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
    if (nrows > 0) {
      PetscCall(VecCUDAGetArrayWrite(diag, &d_diag));
      /* one thread per row, 256 threads per block, on the PETSc default stream */
      GetDiagonal_CSR<<<(int)((nrows + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), nrows, d_diag);
      PetscCallCUDA(cudaPeekAtLastError());
      PetscCall(VecCUDARestoreArrayWrite(diag, &d_diag));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3776
/* Assembly does no GPU-specific work in this routine: it only runs the host SeqAIJ assembly */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  /* finish assembly exactly as a plain SeqAIJ matrix would */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3783
3784 /*@
3785 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3786
3787 Collective
3788
3789 Input Parameters:
3790 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3791 . m - number of rows
3792 . n - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3794 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3795
3796 Output Parameter:
3797 . A - the matrix
3798
3799 Level: intermediate
3800
3801 Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3803 calculations. For good matrix assembly performance the user should preallocate the matrix
3804 storage by setting the parameter `nz` (or the array `nnz`).
3805
3806 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
3808 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3809
3810 The AIJ format, also called
3811 compressed row storage, is fully compatible with standard Fortran
3812 storage. That is, the stored row and column indices can begin at
3813 either one (as in Fortran) or zero.
3814
3815 Specify the preallocated storage with either nz or nnz (not both).
3816 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3817 allocation.
3818
3819 When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
3820
3821 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3822 `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3823 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscInt *rowcounts = (PetscInt *)nnz; /* same const-dropping cast the preallocation call has always used */

  PetscFunctionBegin;
  /* create the object, give it identical local and global sizes, then pick the GPU type */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* preallocate through the SeqAIJ routine */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, rowcounts));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3833
/*
  Destroys the GPU side of the matrix (multiply structures for a regular matrix, triangular-factor data for
  a factored one), detaches every function composed on the object here, and finally destroys the SeqAIJ part.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed functions to detach, listed in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  }
  for (size_t k = 0; k < sizeof(composed) / sizeof(composed[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3855
3856 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3857 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicates A with the host SeqAIJ routine, then converts the new matrix in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate, honouring cpvalues */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* step 2: in-place conversion of the duplicate to the cuSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3865
/*
  MatAXPY_SeqAIJCUSPARSE - Y = a*X + Y

  Three paths, chosen from the (possibly upgraded) nonzero-structure hint str:
    SUBSET_NONZERO_PATTERN - cuSPARSE spgeam writing the result in place into Y
    SAME_NONZERO_PATTERN   - a single cuBLAS axpy over the value arrays
    anything else          - the host SeqAIJ implementation
  The host implementation is also used when X and Y do not share the same axpy operation.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ *)X->data, *yaij = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *ycusp = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *xcusp = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *yval;
  const PetscScalar  *xval;
  CsrMatrix          *ycsr, *xcsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(ycusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(xcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)ycusp->mat->mat;
  xcsr = (CsrMatrix *)xcusp->mat->mat;
  /* see if we can turn this into a cublas axpy: equal counts plus identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool same = thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin()) && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin());

    if (same) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar one = 1.0; /* coefficient applied to Y in the spgeam call */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    /* a and one live on the host, so switch the handle to host pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xval, xcsr->row_offsets->data().get(), xcsr->column_indices->data().get(), &one, ycusp->mat->descr, yaij->nz, yval, ycsr->row_offsets->data().get(),
                                                     ycsr->column_indices->data().get(), ycusp->mat->descr, yval, ycsr->row_offsets->data().get(), ycsr->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xval, xcsr->row_offsets->data().get(), xcsr->column_indices->data().get(), &one, ycusp->mat->descr, yaij->nz, yval, ycsr->row_offsets->data().get(),
                                          ycsr->column_indices->data().get(), ycusp->mat->descr, yval, ycsr->row_offsets->data().get(), ycsr->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(ycusp->handle, Y->rmap->n, Y->cmap->n, &a, xcusp->mat->descr, xaij->nz, xval, xcsr->row_offsets->data().get(), xcsr->column_indices->data().get(), &one, ycusp->mat->descr, yaij->nz, yval, ycsr->row_offsets->data().get(),
                                          ycsr->column_indices->data().get(), ycusp->mat->descr, yval, ycsr->row_offsets->data().get(), ycsr->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
  } break;
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t blas;
    PetscBLASInt   inc = 1, len = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
    PetscCall(PetscCUBLASGetHandle(&blas));
    PetscCall(PetscBLASIntCast(xaij->nz, &len));
    PetscCall(PetscLogGpuTimeBegin());
    /* identical patterns: the value arrays line up entry by entry, so a dense axpy suffices */
    PetscCallCUBLAS(cublasXaxpy(blas, len, &a, xval, inc, yval, inc));
    PetscCall(PetscLogGpuFlops(2.0 * len));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xval));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
  } break;
  default:
    /* any other structure hint: let the host SeqAIJ code do the update */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3949
/* Y = a*Y, done as one cuBLAS scal over the device array of stored values */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *yaij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *yval;
  cublasHandle_t blas;
  PetscBLASInt   inc = 1, len = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yval));
  PetscCall(PetscCUBLASGetHandle(&blas));
  PetscCall(PetscBLASIntCast(yaij->nz, &len)); /* number of stored values, as a BLAS integer */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blas, len, &a, yval, inc));
  PetscCall(PetscLogGpuFlops(len));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yval));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3968
/*
  Zeros the stored values of A. For an unfactored matrix whose device value array exists, the device values
  (and those of the stored transpose, if present) are zeroed and the matrix is marked as current on the GPU.
  Otherwise the host value array is zeroed and the matrix is marked as current on the CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij         = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroedOnGpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        zeroedOnGpu = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      /* keep the stored transpose consistent with the zeroed matrix */
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (!zeroedOnGpu) {
    /* nothing was zeroed on the device: clear the host values instead */
    PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3996
/* Reports the memory type of this matrix type: always PETSC_MEMTYPE_CUDA, the matrix argument is not inspected */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  const PetscMemType mtype = PETSC_MEMTYPE_CUDA;

  PetscFunctionBegin;
  *m = mtype;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4003
/*
  Switches the operation tables of A between the host SeqAIJ and the cuSPARSE implementations.

  flg = PETSC_TRUE : copy the matrix back from the GPU, install the SeqAIJ operations, clear the SeqAIJ-level
                     ops table and detach the GPU-only composed functions
  flg = PETSC_FALSE: install the cuSPARSE operations and compose the GPU functions
  Factored matrices only record the flag. Inodes are used only when bound to the CPU and inode data exists.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!flg) {
    /* matrix-level operations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    /* SeqAIJ-level array access operations */
    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* composed functions to detach, listed in the order they are removed */
    static const char *const detached[] = {
      "MatSeqAIJCopySubArray_C",
      "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
      "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
      "MatSetPreallocationCOO_C",
      "MatSetValuesCOO_C",
      "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    };

    /* bring the values back to the host before the host operations take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    for (size_t k = 0; k < sizeof(detached) / sizeof(detached[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, detached[k], NULL));
  }
  A->boundtocpu  = flg;
  aij->inode.use = (flg && aij->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4071
/*
  Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

  MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the existing *newmat, and any
  other reuse value works on *newmat as passed in. The target then gets VECCUDA as its default vector type,
  a freshly created cuSPARSE context when it has none (never for MAT_REUSE_MATRIX), and the cuSPARSE operations.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat target;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default: /* nothing to copy: *newmat is used as it is */
    break;
  }
  target = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(target->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &target->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !target->spptr) {
    if (target->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      target->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      target->spptr = cusp;
    }
    target->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* operations that stay the same whether or not the matrix is bound to the CPU */
  target->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  target->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  target->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  target->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  target->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  target->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  target->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* install the GPU implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(target, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)target, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)target, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)target, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)target, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4132
/*
  MatCreate_SeqAIJCUSPARSE - constructor for MATSEQAIJCUSPARSE: builds B as a SeqAIJ matrix and
  then converts it in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  Mat inplace = B; /* MAT_INPLACE_MATRIX: the conversion works on the matrix found behind the pointer */

  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &inplace));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4140
4141 /*MC
4142 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4143
4144 Options Database Keys:
4145 + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4146 . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4147 Other options include ell (ellpack) or hyb (hybrid).
4148 . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4149 - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
4150
4151 Level: beginner
4152
4153 Notes:
4154 These matrices can be in either CSR, ELL, or HYB format.
4155
4156 All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4157
4158 Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4159 if some integer values passed in do not fit in `int`.
4160
4161 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162 M*/
4163
/*
  MatSolverTypeRegister_CUSPARSE - registers MatGetFactor_seqaijcusparse_cusparse() as the
  MATSOLVERCUSPARSE factory of MATSEQAIJCUSPARSE for the LU, Cholesky, ILU and ICC factor types.
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (const MatFactorType ftype : ftypes) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftype, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4173
/*
  MatSeqAIJCUSPARSE_Destroy - releases the Mat_SeqAIJCUSPARSE structure stored in mat->spptr
  (multiply structures, work arrays, the cuSPARSE handle and the structure itself).
  Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS);

  /* the matrix and its (possibly absent) explicit transpose */
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));

  /* auxiliary arrays */
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;

  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4191
/*
  CsrMatrix_Destroy - deletes the three arrays of a CsrMatrix and the CsrMatrix itself, then sets
  the caller's pointer to NULL. Does nothing when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr;

  PetscFunctionBegin;
  csr = *mat;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4204
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - overload for a triangular factor: destroys the matrix
  descriptor and the solve info, the CSR matrix, the device and pinned host buffers, and finally
  frees the structure itself. Does nothing when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf;

  PetscFunctionBegin;
  tf = *trifactor;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);

  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  /* free through the caller's pointer, not through the local alias */
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
4223
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - releases a multiply structure and sets *matstruct to NULL

  Input Parameters:
+ matstruct - address of the structure to destroy, *matstruct may be NULL
- format    - storage format of matstruct->mat, it selects how that opaque pointer is released

  Note:
  With CUDA 11.0 or later an ELL/HYB structure that still holds a matrix raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms;

  PetscFunctionBegin;
  ms = *matstruct;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);

  if (ms->mat) {
    const bool hyb_like = (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB);

    if (!hyb_like) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
  #else
      PetscCallCUSPARSE(cusparseDestroyHybMat((cusparseHybMat_t)ms->mat));
  #endif
    }
  }

  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  /* device resident scalar constants */
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

  /* one slot per entry of cuSpMV[]; only initialized slots own a buffer and descriptors */
  for (int slot = 0; slot < 3; ++slot) {
    if (!ms->cuSpMV[slot].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[slot].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[slot].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[slot].vecYDescr));
    #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
    if (ms->matDescr_SpMV[slot]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[slot]));
    if (ms->matDescr_SpMM[slot]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[slot]));
    #endif
  }
  #endif

  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4270
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases everything owned by a triangular-factors structure
  except the structure itself and its cuSPARSE handle

  Input Parameter:
. trifactors - address of the structure, *trifactors may be NULL in which case nothing is done

  Notes:
  Every device buffer and cuSPARSE descriptor is set to NULL right after it is released, in the
  same way the permutation arrays are. This way a second reset (MatSeqAIJCUSPARSETriFactors_Destroy()
  always performs one) does not free or destroy the same object twice.

  factBuffer_M is not freed here, it shares its storage with one of spsvBuffer_L/U.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
  #endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays of the factored matrix and the work vectors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* host side copies */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  #endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4324
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factors structure, destroys its
  cuSPARSE handle and frees the structure. Does nothing when *trifactors is NULL.

  Note:
  The handle is only destroyed when it is set, matching MatSeqAIJCUSPARSE_Destroy().
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4335
/* Strict "less than" on (first, second) pairs: ordered by the first entry, ties broken by the second */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const bool same_first = (thrust::get<0>(t1) == thrust::get<0>(t2));

    return same_first ? (thrust::get<1>(t1) < thrust::get<1>(t2)) : (thrust::get<0>(t1) < thrust::get<0>(t2));
  }
};
4344
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - flags the cached transpose of A as out of date

  Input Parameters:
+ A       - the MATSEQAIJCUSPARSE matrix
- destroy - when true the transpose structure and the csr2csc_i array are also released

  Note:
  Nothing is changed when A has no GPU structure (A->spptr is NULL).
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4360
/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - destructor of the device COO structure: frees the device
  arrays perm and jmap with cudaFree() and then the structure itself.

  ctx is read as the address of a MatCOOStruct_SeqAIJ pointer.
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(PetscCtxRt ctx)
{
  MatCOOStruct_SeqAIJ **slot = (MatCOOStruct_SeqAIJ **)ctx;
  MatCOOStruct_SeqAIJ  *coo  = *slot;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4371
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices, in host or device memory
- coo_j - column indices, in the same memory space as coo_i (only coo_i is queried)

  Notes:
  The preallocation itself is done on the host by MatSetPreallocationCOO_SeqAIJ(); device indices
  are first copied to temporary host arrays.

  Afterwards the matrix is copied to the GPU and a device version of the host COO structure
  (same scalar fields, jmap and perm in device memory) is attached to the matrix under the name
  "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype = PETSC_MEMTYPE_HOST;
  PetscBool            staged = PETSC_FALSE; /* true when host copies of the indices were made */
  PetscInt            *rows = coo_i, *cols = coo_j;
  PetscContainer       host_container;
  MatCOOStruct_SeqAIJ *host_coo, *dev_coo;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    staged = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &rows, coo_n, &cols));
    PetscCallCUDA(cudaMemcpy(rows, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(cols, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, rows, cols));
  if (staged) PetscCall(PetscFree2(rows, cols));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Fetch the host COO struct and mirror it on the device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&host_container));
  PetscCall(PetscContainerGetPointer(host_container, &host_coo));
  PetscCall(PetscMalloc1(1, &dev_coo));
  *dev_coo = *host_coo; // shallow copy; jmap and perm are replaced by device arrays below

  // jmap has nz + 1 entries
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->jmap, (host_coo->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(dev_coo->jmap, host_coo->jmap, (host_coo->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  // perm has Atot entries
  PetscCallCUDA(cudaMalloc((void **)&dev_coo->perm, host_coo->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(dev_coo->perm, host_coo->perm, host_coo->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Hand the device struct over to the matrix, together with its destructor
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", dev_coo, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4412
/*
  MatAddCOOValues - kernel that accumulates COO values into the array of matrix values

  Input Parameters:
+ kv    - the COO values
. nnz   - number of entries of a[] to compute
. jmap  - entry i of a[] gathers the COO values kv[perm[k]] for k in [jmap[i], jmap[i+1])
. perm  - indices into kv[]
- imode - with INSERT_VALUES the sum replaces a[i], otherwise it is added to a[i]

  Output Parameter:
. a - the matrix values

  Notes:
  Launched on a one dimensional grid; each thread starts at its global index and advances by the
  total number of threads, so any grid size covers all nnz entries.

  The index and the stride are widened to PetscCount before the multiplication. The built-in
  variables are 32-bit, so the products could otherwise wrap for very large launches; a stride
  wrapped to 0 would keep the loop from advancing.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;

    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4423
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the matrix values on the GPU from COO values

  Input Parameters:
+ A     - the matrix, its device COO structure must be attached as "__PETSc_MatCOOStruct_Device"
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES requests write-only access to the matrix values, any other mode read-write access

  Notes:
  Host values are copied to a temporary device buffer that is freed before returning.

  The number of blocks is computed in PetscCount and only then narrowed to int. Casting
  (Annz + 255) to int before the division gives a wrong grid once that sum no longer fits in int.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    const int nthreads = 256;
    const int nblocks  = (int)((Annz + nthreads - 1) / nthreads); /* ceil(Annz / nthreads), divided before narrowing */

    MatAddCOOValues<<<nblocks, nthreads>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* release the temporary device copy of host values */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4463
4464 /*@C
4465 MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4466
4467 Not Collective
4468
4469 Input Parameters:
4470 + A - the matrix
4471 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4472
4473 Output Parameters:
4474 + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4475 - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4476
4477 Level: developer
4478
4479 Note:
4480 When compressed is true, the CSR structure does not contain empty rows
4481
4482 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4483 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Nothing to do only when neither array is requested. Returning as soon as one of the two is
     NULL would leave the other output unset and make the individual checks below pointless */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full row offsets are built once from the host row pointers and cached */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4511
4512 /*@C
4513 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4514
4515 Not Collective
4516
4517 Input Parameters:
4518 + A - the matrix
4519 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4520 . i - the CSR row pointers
4521 - j - the CSR column indices
4522
4523 Level: developer
4524
4525 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4526 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  (void)compressed; /* not needed to restore */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the caller's pointers are cleared, either of them may be omitted */
  if (i) *i = nullptr;
  if (j) *j = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4537
4538 /*@C
4539 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
4540
4541 Not Collective
4542
4543 Input Parameter:
4544 . A - a `MATSEQAIJCUSPARSE` matrix
4545
4546 Output Parameter:
4547 . a - pointer to the device data
4548
4549 Level: developer
4550
4551 Note:
4552 Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4553
4554 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4555 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *storage;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* ELL and HYB storage are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4573
4574 /*@C
4575 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4576
4577 Not Collective
4578
4579 Input Parameters:
4580 + A - a `MATSEQAIJCUSPARSE` matrix
4581 - a - pointer to the device data
4582
4583 Level: developer
4584
4585 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4586 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix is left untouched, only the caller's pointer is cleared */
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4596
4597 /*@C
4598 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4599
4600 Not Collective
4601
4602 Input Parameter:
4603 . A - a `MATSEQAIJCUSPARSE` matrix
4604
4605 Output Parameter:
4606 . a - pointer to the device data
4607
4608 Level: developer
4609
4610 Note:
4611 Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4612
4613 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4614 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *storage;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* ELL and HYB storage are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: the current values must be on the device first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  /* the caller may modify the values: the device copy becomes the valid one and the cached
     transpose is flagged as out of date (it is not destroyed) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4634 /*@C
4635 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4636
4637 Not Collective
4638
4639 Input Parameters:
4640 + A - a `MATSEQAIJCUSPARSE` matrix
4641 - a - pointer to the device data
4642
4643 Level: developer
4644
4645 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4646 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* bump the object state of A, then clear the caller's pointer */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4657
4658 /*@C
4659 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4660
4661 Not Collective
4662
4663 Input Parameter:
4664 . A - a `MATSEQAIJCUSPARSE` matrix
4665
4666 Output Parameter:
4667 . a - pointer to the device data
4668
4669 Level: developer
4670
4671 Note:
4672 Does not trigger any host to device copies.
4673
4674 It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
4675
4676 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4677 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *storage;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* ELL and HYB storage are rejected */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no copy to the GPU is made here, the device structure must already exist */
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  /* the device copy becomes the valid one and the cached transpose is flagged as out of date */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4696
4697 /*@C
4698 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4699
4700 Not Collective
4701
4702 Input Parameters:
4703 + A - a `MATSEQAIJCUSPARSE` matrix
4704 - a - pointer to the device data
4705
4706 Level: developer
4707
4708 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4709 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* bump the object state of A, then clear the caller's pointer */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4720
/* Strict "less than" on 4-tuples that only looks at the first two entries: ordered by entry 0,
   ties broken by entry 1; entries 2 and 3 do not take part in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const bool same_first = (thrust::get<0>(t1) == thrust::get<0>(t2));

    return same_first ? (thrust::get<1>(t1) < thrust::get<1>(t2)) : (thrust::get<0>(t1) < thrust::get<0>(t2));
  }
};
4729
/* Unary functor that adds a fixed offset, given at construction, to its integer argument */
struct Shift {
  int _shift; /* the offset that gets added */

  Shift(int shift) : _shift(shift) { }

  __host__ __device__ inline int operator()(const int &c)
  {
    const int shifted = _shift + c;

    return shifted;
  }
};
4736
4737 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat * C)4738 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4739 {
4740 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4741 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4742 Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4743 CsrMatrix *Acsr, *Bcsr, *Ccsr;
4744 PetscInt Annz, Bnnz;
4745 cusparseStatus_t stat;
4746 PetscInt i, m, n, zero = 0;
4747
4748 PetscFunctionBegin;
4749 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4750 PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4751 PetscAssertPointer(C, 4);
4752 PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4753 PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4754 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4755 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4756 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4757 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4758 if (reuse == MAT_INITIAL_MATRIX) {
4759 m = A->rmap->n;
4760 n = A->cmap->n + B->cmap->n;
4761 PetscCall(MatCreate(PETSC_COMM_SELF, C));
4762 PetscCall(MatSetSizes(*C, m, n, m, n));
4763 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4764 c = (Mat_SeqAIJ *)(*C)->data;
4765 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4766 Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4767 Ccsr = new CsrMatrix;
4768 Cmat->cprowIndices = NULL;
4769 c->compressedrow.use = PETSC_FALSE;
4770 c->compressedrow.nrows = 0;
4771 c->compressedrow.i = NULL;
4772 c->compressedrow.rindex = NULL;
4773 Ccusp->workVector = NULL;
4774 Ccusp->nrows = m;
4775 Ccusp->mat = Cmat;
4776 Ccusp->mat->mat = Ccsr;
4777 Ccsr->num_rows = m;
4778 Ccsr->num_cols = n;
4779 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4780 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4781 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4782 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4783 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4784 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4785 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4786 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4787 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4788 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4789 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4790 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4791 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4792
4793 Acsr = (CsrMatrix *)Acusp->mat->mat;
4794 Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4795 Annz = (PetscInt)Acsr->column_indices->size();
4796 Bnnz = (PetscInt)Bcsr->column_indices->size();
4797 c->nz = Annz + Bnnz;
4798 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4799 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4800 Ccsr->values = new THRUSTARRAY(c->nz);
4801 Ccsr->num_entries = c->nz;
4802 Ccusp->coords = new THRUSTINTARRAY(c->nz);
4803 if (c->nz) {
4804 auto Acoo = new THRUSTINTARRAY32(Annz);
4805 auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4806 auto Ccoo = new THRUSTINTARRAY32(c->nz);
4807 THRUSTINTARRAY32 *Aroff, *Broff;
4808
4809 if (a->compressedrow.use) { /* need full row offset */
4810 if (!Acusp->rowoffsets_gpu) {
4811 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4812 Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4813 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4814 }
4815 Aroff = Acusp->rowoffsets_gpu;
4816 } else Aroff = Acsr->row_offsets;
4817 if (b->compressedrow.use) { /* need full row offset */
4818 if (!Bcusp->rowoffsets_gpu) {
4819 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4820 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4821 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4822 }
4823 Broff = Bcusp->rowoffsets_gpu;
4824 } else Broff = Bcsr->row_offsets;
4825 PetscCall(PetscLogGpuTimeBegin());
4826 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4827 PetscCallCUSPARSE(stat);
4828 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4829 PetscCallCUSPARSE(stat);
4830 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4831 auto Aperm = thrust::make_constant_iterator(1);
4832 auto Bperm = thrust::make_constant_iterator(0);
4833 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4834 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4835 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4836 #else
4837 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4838 auto Bcib = Bcsr->column_indices->begin();
4839 auto Bcie = Bcsr->column_indices->end();
4840 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4841 #endif
4842 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4843 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4844 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4845 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4846 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4847 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4848 auto p1 = Ccusp->coords->begin();
4849 auto p2 = Ccusp->coords->begin();
4850 #if CCCL_VERSION >= 3001000
4851 cuda::std::advance(p2, Annz);
4852 #else
4853 thrust::advance(p2, Annz);
4854 #endif
4855 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4856 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4857 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4858 #endif
4859 auto cci = thrust::make_counting_iterator(zero);
4860 auto cce = thrust::make_counting_iterator(c->nz);
4861 #if 0 //Errors on SUMMIT cuda 11.1.0
4862 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4863 #else
4864 #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4865 auto pred = thrust::identity<int>();
4866 #else
4867 auto pred = cuda::std::identity();
4868 #endif
4869 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4870 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4871 #endif
4872 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4873 PetscCallCUSPARSE(stat);
4874 PetscCall(PetscLogGpuTimeEnd());
4875 delete wPerm;
4876 delete Acoo;
4877 delete Bcoo;
4878 delete Ccoo;
4879 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4880 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4881 PetscCallCUSPARSE(stat);
4882 #endif
4883 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4884 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4885 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4886 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4887 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4888 CsrMatrix *CcsrT = new CsrMatrix;
4889 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4890 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4891
4892 (*C)->form_explicit_transpose = PETSC_TRUE;
4893 (*C)->transupdated = PETSC_TRUE;
4894 Ccusp->rowoffsets_gpu = NULL;
4895 CmatT->cprowIndices = NULL;
4896 CmatT->mat = CcsrT;
4897 CcsrT->num_rows = n;
4898 CcsrT->num_cols = m;
4899 CcsrT->num_entries = c->nz;
4900
4901 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4902 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4903 CcsrT->values = new THRUSTARRAY(c->nz);
4904
4905 PetscCall(PetscLogGpuTimeBegin());
4906 auto rT = CcsrT->row_offsets->begin();
4907 if (AT) {
4908 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4909 #if CCCL_VERSION >= 3001000
4910 cuda::std::advance(rT, -1);
4911 #else
4912 thrust::advance(rT, -1);
4913 #endif
4914 }
4915 if (BT) {
4916 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4917 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4918 thrust::copy(titb, tite, rT);
4919 }
4920 auto cT = CcsrT->column_indices->begin();
4921 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4922 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4923 auto vT = CcsrT->values->begin();
4924 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4925 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4926 PetscCall(PetscLogGpuTimeEnd());
4927
4928 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4929 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4930 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4931 PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4932 PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4933 PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4934 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4935 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4936 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4937 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4938 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4939 PetscCallCUSPARSE(stat);
4940 #endif
4941 Ccusp->matTranspose = CmatT;
4942 }
4943 }
4944
4945 c->free_a = PETSC_TRUE;
4946 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4947 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4948 c->free_ij = PETSC_TRUE;
4949 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4950 THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4951 THRUSTINTARRAY jj(Ccsr->column_indices->size());
4952 ii = *Ccsr->row_offsets;
4953 jj = *Ccsr->column_indices;
4954 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4955 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4956 } else {
4957 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4958 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4959 }
4960 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4961 PetscCall(PetscMalloc1(m, &c->ilen));
4962 PetscCall(PetscMalloc1(m, &c->imax));
4963 c->maxnz = c->nz;
4964 c->nonzerorowcnt = 0;
4965 c->rmax = 0;
4966 for (i = 0; i < m; i++) {
4967 const PetscInt nn = c->i[i + 1] - c->i[i];
4968 c->ilen[i] = c->imax[i] = nn;
4969 c->nonzerorowcnt += (PetscInt)!!nn;
4970 c->rmax = PetscMax(c->rmax, nn);
4971 }
4972 PetscCall(PetscMalloc1(c->nz, &c->a));
4973 (*C)->nonzerostate++;
4974 PetscCall(PetscLayoutSetUp((*C)->rmap));
4975 PetscCall(PetscLayoutSetUp((*C)->cmap));
4976 Ccusp->nonzerostate = (*C)->nonzerostate;
4977 (*C)->preallocated = PETSC_TRUE;
4978 } else {
4979 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4980 c = (Mat_SeqAIJ *)(*C)->data;
4981 if (c->nz) {
4982 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4983 PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4984 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4985 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4986 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4987 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4988 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4989 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4990 Acsr = (CsrMatrix *)Acusp->mat->mat;
4991 Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4992 Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4993 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4994 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4995 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4996 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4997 PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4998 auto pmid = Ccusp->coords->begin();
4999 #if CCCL_VERSION >= 3001000
5000 cuda::std::advance(pmid, Acsr->num_entries);
5001 #else
5002 thrust::advance(pmid, Acsr->num_entries);
5003 #endif
5004 PetscCall(PetscLogGpuTimeBegin());
5005 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
5006 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5007 thrust::for_each(zibait, zieait, VecCUDAEquals());
5008 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5009 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
5010 thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5011 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5012 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5013 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5014 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5015 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5016 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5017 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5018 auto vT = CcsrT->values->begin();
5019 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5020 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5021 (*C)->transupdated = PETSC_TRUE;
5022 }
5023 PetscCall(PetscLogGpuTimeEnd());
5024 }
5025 }
5026 PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5027 (*C)->assembled = PETSC_TRUE;
5028 (*C)->was_assembled = PETSC_FALSE;
5029 (*C)->offloadmask = PETSC_OFFLOAD_GPU;
5030 PetscFunctionReturn(PETSC_SUCCESS);
5031 }
5032
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the value array returned by
  MatSeqAIJCUSPARSEGetArrayRead() into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions (into the value array) of the entries to gather; it is uploaded to the device
        and logged as a CPU-to-GPU transfer. If NULL, or if n is 0, the first n values are
        copied contiguously instead

  Output Parameter:
. v - destination for the n values; isCudaMem(v) decides whether it is written directly as
      device memory or filled through a device staging buffer followed by a device-to-host copy

  Notes:
  The staging buffer is a scoped THRUSTARRAY, so it is released on every exit path, including
  the early returns taken by the PetscCall*() macros on error.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* w stays empty when v can be written directly; otherwise it stages the gathered values */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals performs the per-pair assignment */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    PetscCallThrust(thrust::for_each(zibit, zieit, VecCUDAEquals()));
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the n scalars travelled from device to host, so account them as GPU-to-CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5068