1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #include <thrust/async/for_each.h> 19 #endif 20 #include <thrust/iterator/constant_iterator.h> 21 #include <thrust/remove.h> 22 #include <thrust/sort.h> 23 #include <thrust/unique.h> 24 25 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 26 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 27 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 28 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
29 30 typedef enum { 31 CUSPARSE_MV_ALG_DEFAULT = 0, 32 CUSPARSE_COOMV_ALG = 1, 33 CUSPARSE_CSRMV_ALG1 = 2, 34 CUSPARSE_CSRMV_ALG2 = 3 35 } cusparseSpMVAlg_t; 36 37 typedef enum { 38 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 39 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 40 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 41 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 42 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 43 CUSPARSE_SPMM_ALG_DEFAULT = 0, 44 CUSPARSE_SPMM_COO_ALG1 = 1, 45 CUSPARSE_SPMM_COO_ALG2 = 2, 46 CUSPARSE_SPMM_COO_ALG3 = 3, 47 CUSPARSE_SPMM_COO_ALG4 = 5, 48 CUSPARSE_SPMM_CSR_ALG1 = 4, 49 CUSPARSE_SPMM_CSR_ALG2 = 6, 50 } cusparseSpMMAlg_t; 51 52 typedef enum { 53 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 54 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 55 } cusparseCsr2CscAlg_t; 56 */ 57 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 58 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 59 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 60 #endif 61 62 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 63 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 65 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 66 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 68 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 69 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 72 #endif 73 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 74 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 75 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 76 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 77 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 78 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 79 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 80 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 81 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 82 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 83 84 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 85 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 86 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 87 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 88 89 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 90 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 91 92 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 93 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 94 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 95 96 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 97 { 98 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 99 100 PetscFunctionBegin; 101 switch (op) { 102 case MAT_CUSPARSE_MULT: 103 cusparsestruct->format = format; 104 break; 105 case MAT_CUSPARSE_ALL: 106 cusparsestruct->format = format; 107 break; 108 default: 109 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 110 } 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 /*@ 115 MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 116 operation. Only the `MatMult()` operation can use different GPU storage formats 117 118 Not Collective 119 120 Input Parameters: 121 + A - Matrix of type `MATSEQAIJCUSPARSE` 122 . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 123 `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 124 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 
125 126 Level: intermediate 127 128 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129 @*/ 130 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 131 { 132 PetscFunctionBegin; 133 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 134 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 139 { 140 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 141 142 PetscFunctionBegin; 143 cusparsestruct->use_cpu_solve = use_cpu; 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 /*@ 148 MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 149 150 Input Parameters: 151 + A - Matrix of type `MATSEQAIJCUSPARSE` 152 - use_cpu - set flag for using the built-in CPU `MatSolve()` 153 154 Level: intermediate 155 156 Note: 157 The cuSparse LU solver currently computes the factors with the built-in CPU method 158 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 159 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
160 161 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 162 @*/ 163 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 164 { 165 PetscFunctionBegin; 166 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 167 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 168 PetscFunctionReturn(PETSC_SUCCESS); 169 } 170 171 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 172 { 173 PetscFunctionBegin; 174 switch (op) { 175 case MAT_FORM_EXPLICIT_TRANSPOSE: 176 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 177 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 178 A->form_explicit_transpose = flg; 179 break; 180 default: 181 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 182 break; 183 } 184 PetscFunctionReturn(PETSC_SUCCESS); 185 } 186 187 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 188 { 189 MatCUSPARSEStorageFormat format; 190 PetscBool flg; 191 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 192 193 PetscFunctionBegin; 194 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 195 if (A->factortype == MAT_FACTOR_NONE) { 196 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 197 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 198 199 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 200 if (flg) 
PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 201 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 202 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 203 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 204 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 205 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 206 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 207 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 208 #else 209 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 210 #endif 211 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 212 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 213 214 PetscCall( 215 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 216 PetscCheck(!flg || 
CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 217 #endif 218 } 219 PetscOptionsHeadEnd(); 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 224 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 225 { 226 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 227 PetscInt m = A->rmap->n; 228 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 229 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 230 const MatScalar *Aa = a->a; 231 PetscInt *Mi, *Mj, Mnz; 232 PetscScalar *Ma; 233 234 PetscFunctionBegin; 235 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 236 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 237 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 238 Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 239 PetscCall(PetscMalloc1(m + 1, &Mi)); 240 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 241 PetscCall(PetscMalloc1(Mnz, &Ma)); 242 Mi[0] = 0; 243 for (PetscInt i = 0; i < m; i++) { 244 PetscInt llen = Ai[i + 1] - Ai[i]; 245 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 246 PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 247 Mj[Mi[i] + llen] = i; // diagonal entry 248 PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 249 Mi[i + 1] = Mi[i] + llen + ulen; 250 } 251 // Copy M (L,U) from host to device 252 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 253 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 254 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 255 
      // Upload the sparsity pattern (row pointers and column indices); the values are uploaded further below
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L: lower fill mode, unit diagonal. Note that L and U descriptors are created over the same three device arrays;
      // only the fill-mode and diag-type attributes differ between them.
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // U: upper fill mode, non-unit diagonal
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      // Dense-vector descriptors wrapping the two work vectors
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: the host row pointers and value buffer are kept in fs, the temporary column indices are freed
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (done on every call that reaches here, using the host arrays recorded in fs)
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (first call) or refreshes (later calls) the device copy of the
  lower triangular factor of A as a CSR matrix with explicit 1's on the diagonal.

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no rows. Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED
  or PETSC_OFFLOAD_CPU. When no lower factor struct exists yet, the indices and values are assembled on the
  host, copied into device arrays and the csrsv solve analysis is performed; otherwise only the values are
  reassembled and copied.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal.
      */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* host staging buffer for the values; it is kept afterwards (as AA_h) for later value-only updates */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        /* row 0 is written by hand as a single unit diagonal entry; the loop below starts at row 1 */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* the nz stored entries of row i, followed by the explicit unit diagonal */
          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* copy the host arrays into the CsrMatrix containers (presumably thrust device vectors - confirm in cusparsematimpl.h) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        /* wait for the device so the logged event covers the whole analysis */
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        /* the index staging arrays are no longer needed; only the value buffer is kept */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        /* same layout as on the first call: row 0 is a lone unit diagonal, every later row ends with a 1 */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds (first call) or refreshes (later calls) the device copy of the
  upper triangular factor of A as a CSR matrix whose rows start with the diagonal entry.

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no rows. Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED
  or PETSC_OFFLOAD_CPU. When no upper factor struct exists yet, the indices and values are assembled on the
  host, copied into device arrays and the csrsv solve analysis is performed; otherwise only the values are
  reassembled and copied.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
      */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* host staging buffer for the values; it is kept afterwards (as AA_h) for later value-only updates */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        /* rows are visited last to first while offset walks backwards from the end of the output arrays */
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          /* v[nz] is the entry right after the nz off-diagonal ones; its reciprocal is written as the diagonal
             (presumably the host factor stores the inverted diagonal - confirm against the CPU factorization) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* copy the host arrays into the CsrMatrix containers (presumably thrust device vectors - confirm in cusparsematimpl.h) */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        /* wait for the device so the logged event covers the whole analysis */
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
/* Tail of the routine that begins above this section: release the pinned index buffers, or (else branch) refresh only the upper factor values. */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds the GPU representation of an ILU/LU factored matrix.

  Depending on the CUDA version it either calls MatSeqAIJCUSPARSEBuildFactoredMatrix_LU() or builds the
  lower and upper triangular factors separately (allocating the work vector on first use). It then records
  nnz, marks the matrix as PETSC_OFFLOAD_BOTH and, when the row/column orderings are not the identity,
  copies the permutation indices into rpermIndices/cpermIndices the first time they are needed.

  Errors with PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) { /* only uploaded once; later calls reuse the array */
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) { /* only uploaded once; later calls reuse the array */
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - copies the host Cholesky/ICC factor to the GPU for cusparseSpSV.

  Does nothing unless A->offloadmask is PETSC_OFFLOAD_CPU. On the first call (fs->csrRowPtr is NULL) it builds the
  CSR structure of U with an explicit diagonal entry per row, allocates the device arrays, the cusparse matrix and
  vector descriptors and the SpSV buffers, and keeps the host staging arrays (csrVal_h, diag_h) for reuse.
  On every call it refreshes the values on the device and reruns cusparseSpSV_analysis() for U and U^T.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
// Steps: optionally permute b into fs->X, solve with Ut, scale by the stored diagonal, solve with U,
// then optionally permute the result into x.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
/* Element-wise scaling of the intermediate vector: fs->Y[i] *= fs->diag[i] for the m entries. */
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds (or refreshes) the two triangular factors used by the
  legacy csrsv-based ICC/Cholesky solve.

  Does nothing when the matrix has zero rows or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED nor
  PETSC_OFFLOAD_CPU. If neither factor exists yet, the CSR structure and values are assembled in pinned host
  buffers, copied to the GPU, and the cusparse solve analysis is run for both factors. Otherwise only the
  numerical values of the existing factors are re-assigned.

  Both factors share the same (upper) sparsity pattern; the "lower" factor is solved with
  CUSPARSE_OPERATION_TRANSPOSE and stores the values divided by the row's diagonal entry.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* the "lower" factor reuses the upper-stored pattern and is applied through a transpose solve */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the GPU representation of an ICC/Cholesky factored matrix.

  Builds the factors with the routine matching the CUDA version, records nnz, marks the matrix as
  PETSC_OFFLOAD_BOTH and, for a non-identity ordering, uploads the permutation (rpermIndices) and its
  inverse (cpermIndices).

  Errors with PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  /* Upload the permutations only once, as the ILU path does: this routine runs on every numeric
     factorization, and re-allocating here would leak the previously allocated device arrays. */
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
/* Column permutation: indices of the inverted ordering (iip). */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is done on the host by MatCholeskyFactorNumeric_SeqAIJ() after A has been
  copied from the GPU; the solve callbacks of B are then selected and the factors are pushed to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* fresh factors live on the host until copied below */

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* the same routine is installed for both the solve and the transpose solve */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (CSC) of the lower and upper
  triangular factors on the GPU and runs the csrsv solve analysis on them.

  For each factor: a new factor struct is created whose descriptor copies the original one with the fill mode
  flipped, the CSR data is converted with cusparse_csr2csc(), the analysis is performed, and the result is stored
  in loTriFactorPtrTranspose / upTriFactorPtrTranspose. The conversion and the analysis are each bracketed by
  matching PetscLogEventBegin()/PetscLogEventEnd() calls.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
  #endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
  #else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
  #endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* close the event opened before the conversion (was a second PetscLogEventBegin, leaving it unbalanced) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
  #endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
  #else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
  #endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* close the event opened before the conversion (was a second PetscLogEventBegin, leaving it unbalanced) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  /* NOTE(review): this sequence duplicates the lower-factor one above; a shared helper would remove the repetition */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Unary functor usable on host and device: converts a PetscScalar to PetscInt via its real part. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1226 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 1227 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1228 matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 1229 PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 1230 if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 1231 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1232 PetscCall(PetscLogGpuTimeBegin()); 1233 if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1234 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1235 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1236 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1237 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1238 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1239 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1240 1241 /* set alpha and beta */ 1242 PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1243 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1244 PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 1245 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1246 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1247 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1248 1249 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1250 CsrMatrix *matrixT = new CsrMatrix; 1251 matstructT->mat = matrixT; 1252 matrixT->num_rows = A->cmap->n; 1253 matrixT->num_cols = A->rmap->n; 1254 matrixT->num_entries = 
a->nz; 1255 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1256 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1257 matrixT->values = new THRUSTARRAY(a->nz); 1258 1259 if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1260 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1261 1262 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1263 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 1264 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase, cusparse_scalartype); 1266 PetscCallCUSPARSE(stat); 1267 #else 1268 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1269 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1270 1271 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1272 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1273 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1274 */ 1275 if (matrixT->num_entries) { 1276 stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 1277 PetscCallCUSPARSE(stat); 1278 1279 } else { 1280 matstructT->matDescr = NULL; 1281 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1282 } 1283 #endif 1284 #endif 1285 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1286 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1287 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1288 #else 1289 CsrMatrix *temp = new CsrMatrix; 1290 CsrMatrix *tempT = new CsrMatrix; 1291 /* First convert HYB to CSR */ 1292 temp->num_rows = A->rmap->n; 1293 temp->num_cols = A->cmap->n; 1294 temp->num_entries = a->nz; 1295 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1296 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1297 temp->values = new THRUSTARRAY(a->nz); 1298 1299 stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 1300 PetscCallCUSPARSE(stat); 1301 1302 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1303 tempT->num_rows = A->rmap->n; 1304 tempT->num_cols = A->cmap->n; 1305 tempT->num_entries = a->nz; 1306 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1307 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1308 tempT->values = new THRUSTARRAY(a->nz); 1309 1310 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 1311 tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1312 PetscCallCUSPARSE(stat); 1313 1314 /* Last, convert CSC to HYB */ 1315 cusparseHybMat_t hybMat; 1316 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1317 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1318 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 1319 PetscCallCUSPARSE(stat); 1320 1321 /* assign the pointer */ 1322 matstructT->mat = hybMat; 1323 A->transupdated = PETSC_TRUE; 1324 /* delete temporaries */ 1325 if (tempT) { 1326 if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1327 if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1328 if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1329 delete (CsrMatrix *)tempT; 1330 } 1331 if (temp) { 1332 if (temp->values) delete (THRUSTARRAY *)temp->values; 1333 if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1334 if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1335 delete (CsrMatrix *)temp; 1336 } 1337 #endif 1338 } 1339 } 1340 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, 
update data */ 1341 CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1342 CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 1343 PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 1344 PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 1345 PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 1346 PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 1347 PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 1348 PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 1349 PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 1350 PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1351 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1352 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1353 cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1354 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1355 } 1356 if (!cusparsestruct->csr2csc_i) { 1357 THRUSTARRAY csr2csc_a(matrix->num_entries); 1358 PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1359 1360 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1361 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1362 void *csr2cscBuffer; 1363 size_t csr2cscBufferSize; 1364 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1365 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, 
&csr2cscBufferSize); 1366 PetscCallCUSPARSE(stat); 1367 PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1368 #endif 1369 1370 if (matrix->num_entries) { 1371 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1372 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1373 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1374 1375 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1376 should be filled with indexBase. So I just take a shortcut here. 1377 */ 1378 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1379 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1380 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 1381 PetscCallCUSPARSE(stat); 1382 #else 1383 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1384 PetscCallCUSPARSE(stat); 1385 #endif 1386 } else { 1387 matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 1388 } 1389 1390 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1391 PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1392 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1393 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1394 #endif 1395 } 1396 PetscCallThrust( 1397 thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1398 } 1399 PetscCall(PetscLogGpuTimeEnd()); 1400 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1401 /* the compressed row indices is not used for matTranspose */ 1402 matstructT->cprowIndices = NULL; 1403 /* assign the pointer */ 1404 ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 1405 A->transupdated = PETSC_TRUE; 1406 PetscFunctionReturn(PETSC_SUCCESS); 1407 } 1408 1409 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1410 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1411 { 1412 const PetscScalar *barray; 1413 PetscScalar *xarray; 1414 thrust::device_ptr<const PetscScalar> bGPU; 1415 thrust::device_ptr<PetscScalar> xGPU; 1416 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1417 const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1418 const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1419 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1420 PetscInt m = A->rmap->n; 1421 1422 PetscFunctionBegin; 1423 PetscCall(PetscLogGpuTimeBegin()); 1424 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1425 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1426 xGPU = thrust::device_pointer_cast(xarray); 1427 bGPU = thrust::device_pointer_cast(barray); 1428 1429 // Reorder b with the row permutation if needed, and wrap the result in fs->X 1430 if (fs->rpermIndices) { 1431 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1432 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1433 } else { 1434 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1435 } 1436 1437 // Solve L Y = X 1438 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1439 // Note 
// that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - transpose solve with the L and U factors stored in
  A->spptr, using cusparseSpSV with CUSPARSE_OPERATION_TRANSPOSE on the untransposed factors.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  The SpSV descriptors and buffers for the transpose solves are created lazily on the first
  call; the (numeric) analysis is redone whenever fs->updatedTransposeSpSVAnalysis is false.
  The solve order is U^T first (X -> Y), then L^T (Y -> X). fs->X and fs->Y serve as
  intermediate vectors; row/column permutations are applied only if the index arrays exist.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     trans   = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *bdev;
  PetscScalar                  *xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());

  /* One-time setup on the first MatSolveTranspose(): descriptors, buffer sizes, buffers */
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    /* The matrix is still L. We only do transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the analysis phase if it is not up to date */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdev));
  PetscCall(VecCUDAGetArrayRead(b, &bdev));
  thrust::device_ptr<PetscScalar>       xptr = thrust::device_pointer_cast(xdev);
  thrust::device_ptr<const PetscScalar> bptr = thrust::device_pointer_cast(bdev);

  /* Input of the first solve: b itself, or b gathered through the row permutation into fs->X */
  if (!fs->rpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdev));
  } else {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  }

  /* Solve Ut Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  /* Solve Lt X = Y; when a column permutation follows, the result goes to the intermediate buffer fs->X, otherwise directly to x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? (void *)fs->X : (void *)xdev));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt));

  /* Gather fs->X through the column permutation into x, if there is one */
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), xptr));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &bdev));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with the (legacy csrsv) triangular
  factors, for factorizations that carry row and column permutations.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are built and analyzed on first use through
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The sequence is: gather bb through rpermIndices
  into xx, solve with the transposed upper factor (xx -> work vector), solve with the
  transposed lower factor (work vector -> xx), then gather xx through cpermIndices via the
  work vector back into xx.

  All Thrust calls are wrapped in PetscCallThrust() so that Thrust failures are reported
  through the PETSc error mechanism, like the other Thrust calls in this file.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE - solve with the (legacy csrsv) triangular factors, for
  factorizations that carry row and column permutations.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The sequence is: gather bb through rpermIndices into the work vector, solve with the lower
  factor (work vector -> xx), solve with the upper factor (xx -> work vector), then gather the
  work vector through cpermIndices into xx.

  All Thrust calls are wrapped in PetscCallThrust() so that Thrust failures are reported
  through the PETSc error mechanism, like the other Thrust calls in this file.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()));

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of ILU(0) done entirely on the GPU.

  Input Parameters:
+ fact - the factor matrix whose spptr holds the Mat_SeqAIJCUSPARSETriFactors workspace
- A    - the matrix to factor; in debug builds it is checked to be of type MATSEQAIJCUSPARSE

  Notes:
  A's values are copied device-to-device into fs->csrVal, factored in place with
  cusparseXcsrilu02() (skipped when fact has zero rows), and the SpSV analysis for the
  non-transposed L and U solves is run afterwards. The transpose-solve analysis is flagged
  as stale and the solve callbacks of fact are installed.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp   = (Mat_SeqAIJCUSPARSE *)A->spptr;
  const PetscInt                m       = fact->rmap->n;
  const PetscInt                nz      = factAij->nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Bring A up to date on the device, then copy its values into fact's value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  CsrMatrix *Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* In-place factorization of fact; cusparseXcsrilu02 errors out with empty matrices (m=0), so skip it then */
  if (m != 0) { PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); }
  if (PetscDefined(USE_DEBUG)) {
    int              pivotRow;
    cusparseStatus_t pivotStatus = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &pivotRow);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", pivotRow, pivotRow);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Publish the result: the factors live on the GPU and are used through the SpSV-based solvers */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
1773 */ 1774 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1775 1776 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1777 fact->factortype = MAT_FACTOR_ILU; 1778 fact->info.factor_mallocs = 0; 1779 fact->info.fill_ratio_given = info->fill; 1780 fact->info.fill_ratio_needed = 1.0; 1781 1782 aij->row = NULL; 1783 aij->col = NULL; 1784 1785 /* ====================================================================== */ 1786 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1787 /* We'll do in-place factorization on fact */ 1788 /* ====================================================================== */ 1789 const int *Ai, *Aj; 1790 1791 m = fact->rmap->n; 1792 nz = aij->nz; 1793 1794 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1795 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1797 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1798 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1799 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800 1801 /* ====================================================================== */ 1802 /* Create descriptors for M, L, U */ 1803 /* ====================================================================== */ 1804 cusparseFillMode_t fillMode; 1805 cusparseDiagType_t diagType; 1806 1807 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1808 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1809 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1810 1811 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1812 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1813 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1814 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1815 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1816 */ 1817 fillMode = CUSPARSE_FILL_MODE_LOWER; 1818 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1819 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1820 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1821 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1822 1823 fillMode = CUSPARSE_FILL_MODE_UPPER; 1824 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1825 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1826 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1827 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1828 1829 /* ========================================================================= */ 1830 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1831 /* ========================================================================= */ 1832 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1833 if (m) 1834 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1835 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1836 1837 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1838 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1839 1840 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1841 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1842 1843 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1844 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1845 1846 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1847 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1848 1849 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1850 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1851 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1852 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1853 */ 1854 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1855 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1856 fs->spsvBuffer_L = fs->factBuffer_M; 1857 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1858 } else { 1859 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1860 fs->spsvBuffer_U = fs->factBuffer_M; 1861 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1862 } 1863 1864 /* ========================================================================== */ 1865 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1866 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1867 /* ========================================================================== */ 1868 int structural_zero; 1869 cusparseStatus_t status; 1870 1871 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1872 if (m) 1873 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1874 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1875 if (PetscDefined(USE_DEBUG)) { 1876 /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1877 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1878 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1879 } 1880 1881 /* Estimate FLOPs of the numeric factorization */ 1882 { 1883 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1884 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1885 PetscLogDouble flops = 0.0; 1886 1887 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1888 Ai = Aseq->i; 1889 Adiag = Aseq->diag; 1890 for (PetscInt i = 0; i < m; i++) { 1891 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1892 nzRow = Ai[i + 1] - Ai[i]; 1893 nzLeft = Adiag[i] - Ai[i]; 1894 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1895 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1896 */ 1897 nzLeft = (nzRow - 1) / 2; 1898 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1899 } 1900 } 1901 fs->numericFactFlops = flops; 1902 } 1903 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1904 PetscFunctionReturn(PETSC_SUCCESS); 1905 } 1906 1907 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1908 { 1909 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1910 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1911 const PetscScalar *barray; 1912 PetscScalar *xarray; 1913 1914 PetscFunctionBegin; 1915 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1916 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1917 PetscCall(PetscLogGpuTimeBegin()); 1918 1919 /* Solve L*y = b */ 1920 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1921 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1922 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 1923 fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1924 1925 /* Solve Lt*x = y */ 1926 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1927 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 1928 fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1929 1930 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1931 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1932 1933 PetscCall(PetscLogGpuTimeEnd()); 1934 PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1935 PetscFunctionReturn(PETSC_SUCCESS); 1936 } 1937 1938 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1939 { 1940 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1941 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1942 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1943 CsrMatrix *Acsr; 1944 PetscInt m, nz; 1945 PetscBool flg; 1946 1947 PetscFunctionBegin; 1948 if (PetscDefined(USE_DEBUG)) { 1949 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1950 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1951 } 1952 1953 /* Copy A's value to fact */ 1954 m = fact->rmap->n; 1955 nz = aij->nz; 1956 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1957 Acsr = (CsrMatrix *)Acusp->mat->mat; 1958 PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1959 1960 /* Factorize fact inplace */ 1961 /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1962 Function csric02() only takes the lower triangular part of matrix A to perform factorization. 
1963 The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1964 and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1965 In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1966 */ 1967 if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1968 if (PetscDefined(USE_DEBUG)) { 1969 int numerical_zero; 1970 cusparseStatus_t status; 1971 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1972 PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1973 } 1974 1975 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1976 1977 /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1978 ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1979 */ 1980 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1981 1982 fact->offloadmask = PETSC_OFFLOAD_GPU; 1983 fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1984 fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1985 fact->ops->matsolve = NULL; 1986 fact->ops->matsolvetranspose = NULL; 1987 PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1988 PetscFunctionReturn(PETSC_SUCCESS); 1989 } 
1990 1991 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1992 { 1993 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1994 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1995 PetscInt m, nz; 1996 1997 PetscFunctionBegin; 1998 if (PetscDefined(USE_DEBUG)) { 1999 PetscInt i; 2000 PetscBool flg, missing; 2001 2002 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2003 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2004 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2005 PetscCall(MatMissingDiagonal(A, &missing, &i)); 2006 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2007 } 2008 2009 /* Free the old stale stuff */ 2010 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2011 2012 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2013 but they will not be used. Allocate them just for easy debugging. 2014 */ 2015 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2016 2017 fact->offloadmask = PETSC_OFFLOAD_BOTH; 2018 fact->factortype = MAT_FACTOR_ICC; 2019 fact->info.factor_mallocs = 0; 2020 fact->info.fill_ratio_given = info->fill; 2021 fact->info.fill_ratio_needed = 1.0; 2022 2023 aij->row = NULL; 2024 aij->col = NULL; 2025 2026 /* ====================================================================== */ 2027 /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2028 /* We'll do in-place factorization on fact */ 2029 /* ====================================================================== */ 2030 const int *Ai, *Aj; 2031 2032 m = fact->rmap->n; 2033 nz = aij->nz; 2034 2035 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2036 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2037 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2038 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2039 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2040 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2041 2042 /* ====================================================================== */ 2043 /* Create mat descriptors for M, L */ 2044 /* ====================================================================== */ 2045 cusparseFillMode_t fillMode; 2046 cusparseDiagType_t diagType; 2047 2048 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2049 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2050 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2051 2052 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2053 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2054 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2055 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2056 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
2057 */ 2058 fillMode = CUSPARSE_FILL_MODE_LOWER; 2059 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2060 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 2061 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 2062 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2063 2064 /* ========================================================================= */ 2065 /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2066 /* ========================================================================= */ 2067 PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2068 if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2069 2070 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2071 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2072 2073 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2074 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2075 2076 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2077 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2078 2079 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2080 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, 
CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2081 2082 /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2083 See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2084 */ 2085 if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2086 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 2087 fs->spsvBuffer_L = fs->factBuffer_M; 2088 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 2089 } else { 2090 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 2091 fs->spsvBuffer_Lt = fs->factBuffer_M; 2092 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 2093 } 2094 2095 /* ========================================================================== */ 2096 /* Perform analysis of ic0 on M */ 2097 /* The lower triangular part of M has the same sparsity pattern as L */ 2098 /* ========================================================================== */ 2099 int structural_zero; 2100 cusparseStatus_t status; 2101 2102 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2103 if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2104 if (PetscDefined(USE_DEBUG)) { 2105 /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2106 status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2107 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2108 } 2109 2110 /* Estimate FLOPs of the numeric factorization */ 2111 { 2112 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 2113 PetscInt *Ai, nzRow, nzLeft; 2114 PetscLogDouble flops = 0.0; 2115 2116 Ai = Aseq->i; 2117 for (PetscInt i = 0; i < m; i++) { 2118 nzRow = Ai[i + 1] - Ai[i]; 2119 if (nzRow > 1) { 2120 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2121 and include the eliminated one will be updated, which incurs a multiplication and an addition. 2122 */ 2123 nzLeft = (nzRow - 1) / 2; 2124 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2125 } 2126 } 2127 fs->numericFactFlops = flops; 2128 } 2129 fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2130 PetscFunctionReturn(PETSC_SUCCESS); 2131 } 2132 #endif 2133 2134 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2135 { 2136 // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2137 Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2138 2139 PetscFunctionBegin; 2140 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2141 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2142 B->offloadmask = PETSC_OFFLOAD_CPU; 2143 2144 if (!cusparsestruct->use_cpu_solve) { 2145 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2146 B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2147 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2148 #else 2149 /* determine which version of MatSolve needs to be used. 
*/ 2150 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2151 IS isrow = b->row, iscol = b->col; 2152 PetscBool row_identity, col_identity; 2153 2154 PetscCall(ISIdentity(isrow, &row_identity)); 2155 PetscCall(ISIdentity(iscol, &col_identity)); 2156 if (row_identity && col_identity) { 2157 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2158 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2159 } else { 2160 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2161 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2162 } 2163 #endif 2164 } 2165 B->ops->matsolve = NULL; 2166 B->ops->matsolvetranspose = NULL; 2167 2168 /* get the triangular factors */ 2169 if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2174 { 2175 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2176 2177 PetscFunctionBegin; 2178 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2179 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2180 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2181 PetscFunctionReturn(PETSC_SUCCESS); 2182 } 2183 2184 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2185 { 2186 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2187 2188 PetscFunctionBegin; 2189 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2190 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2191 if (cusparseTriFactors->factorizeOnDevice) { 2192 PetscCall(ISIdentity(isrow, &row_identity)); 2193 PetscCall(ISIdentity(iscol, &col_identity)); 2194 } 2195 if (!info->levels && row_identity && col_identity) { 2196 
PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2197 } else 2198 #endif 2199 { 2200 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2201 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2202 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2203 } 2204 PetscFunctionReturn(PETSC_SUCCESS); 2205 } 2206 2207 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2208 { 2209 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2210 2211 PetscFunctionBegin; 2212 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2213 PetscBool perm_identity = PETSC_FALSE; 2214 if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 2215 if (!info->levels && perm_identity) { 2216 PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2217 } else 2218 #endif 2219 { 2220 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2221 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2222 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2223 } 2224 PetscFunctionReturn(PETSC_SUCCESS); 2225 } 2226 2227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2228 { 2229 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2230 2231 PetscFunctionBegin; 2232 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2233 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2234 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2235 PetscFunctionReturn(PETSC_SUCCESS); 2236 } 2237 2238 static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2239 { 2240 PetscFunctionBegin; 2241 *type = MATSOLVERCUSPARSE; 2242 PetscFunctionReturn(PETSC_SUCCESS); 2243 } 2244 2245 /*MC 2246 MATSOLVERCUSPARSE = 
"cusparse" - A matrix type providing triangular solvers for seq matrices 2247 on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2248 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2249 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2250 CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2251 algorithms are not recommended. This class does NOT support direct solver operations. 2252 2253 Level: beginner 2254 2255 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 2256 `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2257 M*/ 2258 2259 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2260 { 2261 PetscInt n = A->rmap->n; 2262 PetscBool factOnDevice, factOnHost; 2263 char *prefix; 2264 char factPlace[32] = "device"; /* the default */ 2265 2266 PetscFunctionBegin; 2267 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2268 PetscCall(MatSetSizes(*B, n, n, n, n)); 2269 (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2270 PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2271 2272 prefix = (*B)->factorprefix ? 
(*B)->factorprefix : ((PetscObject)A)->prefix; 2273 PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat"); 2274 PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2275 PetscOptionsEnd(); 2276 PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2277 PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2278 PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2279 ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2280 2281 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2282 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2283 PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2284 if (!A->boundtocpu) { 2285 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2286 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2287 } else { 2288 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2289 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2290 } 2291 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2292 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2293 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2294 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2295 if (!A->boundtocpu) { 2296 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2297 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2298 } else { 2299 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2300 
(*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2301 } 2302 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2303 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2304 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2305 2306 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2307 (*B)->canuseordering = PETSC_TRUE; 2308 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 2309 PetscFunctionReturn(PETSC_SUCCESS); 2310 } 2311

/*
  MatSeqAIJCUSPARSECopyFromGPU - Bring the host value array a->a up to date with the device copy.

  Nothing happens unless the offload mask is PETSC_OFFLOAD_GPU. In that case a->nz scalars are
  copied device-to-host and the mask becomes PETSC_OFFLOAD_BOTH. Only numerical values are
  transferred; no index array is copied here.

  For a non-factored matrix the values come from the CsrMatrix stored in cusp->mat->mat.
  For a factored matrix the copy is only possible with CUDA >= 11.4 and a non-NULL fs->csrVal;
  every other factored case raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* A->spptr is viewed as the triangular-factor struct when A->factortype != MAT_FACTOR_NONE */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  /* host values are already valid (or nothing is on the device): nothing to do */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(a->a, csr->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  else if (fs->csrVal) {
    /* We have a factorized matrix on device and are able to copy it to host */
    PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
#endif
  else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
  PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-write access to the host value array: sync from the device first, then hand out a->a */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-write access: the host copy is now the only valid one, so mark the matrix PETSC_OFFLOAD_CPU */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access to the host value array: sync from the device first, then hand out a->a */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of read-only access: only the caller's pointer is cleared, the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: the host array is handed out without copying the current values back from the device */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* End of write-only access: the host copy is now the only valid one, so mark the matrix PETSC_OFFLOAD_CPU */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Expose the device CSR arrays of a non-factored matrix.

  Input Parameter:
. A - the matrix; must not be a factored matrix

  Output Parameters (each one is optional and skipped when NULL):
+ i     - row offsets of the device CsrMatrix
. j     - column indices of the device CsrMatrix
. a     - numerical values of the device CsrMatrix
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The matrix is pushed to the GPU with MatSeqAIJCUSPARSECopyToGPU() before any pointer is handed out.

  The factored-matrix and NULL-spptr checks are performed before that call, because
  MatSeqAIJCUSPARSECopyToGPU() itself dereferences A->spptr as a Mat_SeqAIJCUSPARSE.

  When i, j or a is requested the storage must be MAT_CUSPARSE_CSR, since cusp->mat->mat holds a
  CsrMatrix only for that format (the ELL/HYB branch of MatSeqAIJCUSPARSECopyToGPU() stores a hybMat there).

  With PETSC_USE_64BIT_INDICES requesting i or j raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix = NULL;

  PetscFunctionBegin;
  /* validate before touching the GPU structures */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  PetscCheck(A->spptr != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); /* read spptr after the copy, as the rest of the file does */
  PetscCheck(cusp->mat, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");

  if (i || j || a) {
    PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix *)cusp->mat->mat;
    PetscCheck(matrix, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Missing CSR struct");
  }

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2420 { 2421 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 2422 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 2423 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2424 PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2425 cusparseStatus_t stat; 2426 PetscBool both = PETSC_TRUE; 2427 2428 PetscFunctionBegin; 2429 PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2430 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2431 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2432 CsrMatrix *matrix; 2433 matrix = (CsrMatrix *)cusparsestruct->mat->mat; 2434 2435 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2436 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2437 matrix->values->assign(a->a, a->a + a->nz); 2438 PetscCallCUDA(WaitForCUDA()); 2439 PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2440 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2441 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2442 } else { 2443
PetscInt nnz; 2444 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2445 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2446 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2447 delete cusparsestruct->workVector; 2448 delete cusparsestruct->rowoffsets_gpu; 2449 cusparsestruct->workVector = NULL; 2450 cusparsestruct->rowoffsets_gpu = NULL; 2451 try { 2452 if (a->compressedrow.use) { 2453 m = a->compressedrow.nrows; 2454 ii = a->compressedrow.i; 2455 ridx = a->compressedrow.rindex; 2456 } else { 2457 m = A->rmap->n; 2458 ii = a->i; 2459 ridx = NULL; 2460 } 2461 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2462 if (!a->a) { 2463 nnz = ii[m]; 2464 both = PETSC_FALSE; 2465 } else nnz = a->nz; 2466 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2467 2468 /* create cusparse matrix */ 2469 cusparsestruct->nrows = m; 2470 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2471 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2472 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2473 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2474 2475 PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2476 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2477 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2478 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2479 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2480 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2481 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2482 2483 
/* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2484 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2485 /* set the matrix */ 2486 CsrMatrix *mat = new CsrMatrix; 2487 mat->num_rows = m; 2488 mat->num_cols = A->cmap->n; 2489 mat->num_entries = nnz; 2490 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2491 mat->row_offsets->assign(ii, ii + m + 1); 2492 2493 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2494 mat->column_indices->assign(a->j, a->j + nnz); 2495 2496 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2497 if (a->a) mat->values->assign(a->a, a->a + nnz); 2498 2499 /* assign the pointer */ 2500 matstruct->mat = mat; 2501 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2502 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2503 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2504 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2505 PetscCallCUSPARSE(stat); 2506 } 2507 #endif 2508 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2509 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2510 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2511 #else 2512 CsrMatrix *mat = new CsrMatrix; 2513 mat->num_rows = m; 2514 mat->num_cols = A->cmap->n; 2515 mat->num_entries = nnz; 2516 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2517 mat->row_offsets->assign(ii, ii + m + 1); 2518 2519 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2520 mat->column_indices->assign(a->j, a->j + nnz); 2521 2522 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2523 if (a->a) mat->values->assign(a->a, a->a + nnz); 2524 2525 cusparseHybMat_t hybMat; 
2526 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2527 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2528 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2529 PetscCallCUSPARSE(stat); 2530 /* assign the pointer */ 2531 matstruct->mat = hybMat; 2532 2533 if (mat) { 2534 if (mat->values) delete (THRUSTARRAY *)mat->values; 2535 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2536 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2537 delete (CsrMatrix *)mat; 2538 } 2539 #endif 2540 } 2541 2542 /* assign the compressed row indices */ 2543 if (a->compressedrow.use) { 2544 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2545 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2546 matstruct->cprowIndices->assign(ridx, ridx + m); 2547 tmp = m; 2548 } else { 2549 cusparsestruct->workVector = NULL; 2550 matstruct->cprowIndices = NULL; 2551 tmp = 0; 2552 } 2553 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2554 2555 /* assign the pointer */ 2556 cusparsestruct->mat = matstruct; 2557 } catch (char *ex) { 2558 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2559 } 2560 PetscCallCUDA(WaitForCUDA()); 2561 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2562 cusparsestruct->nonzerostate = A->nonzerostate; 2563 } 2564 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2565 } 2566 PetscFunctionReturn(PETSC_SUCCESS); 2567 } 2568 2569 struct VecCUDAPlusEquals { 2570 template <typename Tuple> 2571 __host__ __device__ void operator()(Tuple t) 2572 { 2573 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2574 } 2575 }; 2576 2577 struct 
VecCUDAEquals { 2578 template <typename Tuple> 2579 __host__ __device__ void operator()(Tuple t) 2580 { 2581 thrust::get<1>(t) = thrust::get<0>(t); 2582 } 2583 }; 2584

/* Functor for zipped ranges: writes element 1 of the tuple into element 0 (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};

/*
  Product data attached to C->product->data by the MatProductSymbolic routines of this file and
  released by MatDestroy_MatMatCusparse(). The struct is obtained with PetscNew() there.
*/
struct MatMatCusparse {
  PetscBool      cisdense; /* C was of type MATSEQDENSE at symbolic time; the numeric phase converts it back after computing on the GPU */
  PetscScalar   *Bt;       /* device buffer holding B^T; allocated only with CUDA < 11 for the ABt and RARt products */
  Mat            X;        /* intermediate dense matrix used by the RARt and PtAP products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* value passed to PetscLogGpuFlops() by the sparse-sparse numeric phase */
  CsrMatrix     *Bcsr;     /* when set, used in place of B's own CsrMatrix (B may be in compressed row storage) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor used together with Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor of B for cusparseSpMM() */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor of C, or of X for RARt/PtAP */
  PetscInt             Blda, Clda;  /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size in bytes recorded for mmBuffer */
  void                 *mmBuffer;     /* work buffer passed to cusparseSpMM() / cusparseSpGEMM_compute() */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/*
  MatDestroy_MatMatCusparse - Destructor installed as C->product->destroy.

  Releases, in this order: the B^T device buffer, the Bcsr struct, the cuSPARSE descriptors and
  device work buffers (CUDA >= 11 only), the intermediate matrix X, and finally the
  MatMatCusparse struct itself. Descriptors and work buffers are released only when non-NULL.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* cuSPARSE descriptors */
  if (mm->matSpBDescr) {
    PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  }
  if (mm->matBDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  }
  if (mm->matCDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  }
  if (mm->spgemmDesc) {
    PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  }
  /* device work buffers */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) {
    PetscCallCUDA(cudaFree(mm->dBuffer4));
  }
  if (mm->dBuffer5) {
    PetscCallCUDA(cudaFree(mm->dBuffer5));
  }
  #endif
  if (mm->mmBuffer) {
    PetscCallCUDA(cudaFree(mm->mmBuffer));
  }
  if (mm->mmBuffer2) {
    PetscCallCUDA(cudaFree(mm->mmBuffer2));
  }
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2645 { 2646 Mat_Product *product = C->product; 2647 Mat A, B; 2648 PetscInt m, n, blda, clda; 2649 PetscBool flg, biscuda; 2650 Mat_SeqAIJCUSPARSE *cusp; 2651 cusparseStatus_t stat; 2652 cusparseOperation_t opA; 2653 const PetscScalar *barray; 2654 PetscScalar *carray; 2655 MatMatCusparse *mmdata; 2656 Mat_SeqAIJCUSPARSEMultStruct *mat; 2657 CsrMatrix *csrmat; 2658 2659 PetscFunctionBegin; 2660 MatCheckProduct(C, 1); 2661 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2662 mmdata = (MatMatCusparse *)product->data; 2663 A = product->A; 2664 B = product->B; 2665 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2666 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2667 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2668 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2669 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2670 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2671 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2672 switch (product->type) { 2673 case MATPRODUCT_AB: 2674 case MATPRODUCT_PtAP: 2675 mat = cusp->mat; 2676 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2677 m = A->rmap->n; 2678 n = B->cmap->n; 2679 break; 2680 case
MATPRODUCT_AtB: 2681 if (!A->form_explicit_transpose) { 2682 mat = cusp->mat; 2683 opA = CUSPARSE_OPERATION_TRANSPOSE; 2684 } else { 2685 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2686 mat = cusp->matTranspose; 2687 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2688 } 2689 m = A->cmap->n; 2690 n = B->cmap->n; 2691 break; 2692 case MATPRODUCT_ABt: 2693 case MATPRODUCT_RARt: 2694 mat = cusp->mat; 2695 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2696 m = A->rmap->n; 2697 n = B->rmap->n; 2698 break; 2699 default: 2700 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2701 } 2702 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2703 csrmat = (CsrMatrix *)mat->mat; 2704 /* if the user passed a CPU matrix, copy the data to the GPU */ 2705 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2706 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2707 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2708 2709 PetscCall(MatDenseGetLDA(B, &blda)); 2710 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2711 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2712 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2713 } else { 2714 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2715 PetscCall(MatDenseGetLDA(C, &clda)); 2716 } 2717 2718 PetscCall(PetscLogGpuTimeBegin()); 2719 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2720 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2721 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2722 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2723 size_t mmBufferSize; 2724 if (mmdata->initialized && mmdata->Blda != blda) { 2725 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2726 mmdata->matBDescr = NULL; 2727 } 2728 if (!mmdata->matBDescr) { 2729 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2730 mmdata->Blda = blda; 2731 } 2732 2733 if (mmdata->initialized && mmdata->Clda != clda) { 2734 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2735 mmdata->matCDescr = NULL; 2736 } 2737 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2738 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2739 mmdata->Clda = clda; 2740 } 2741 2742 if (!mat->matDescr) { 2743 stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2744 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2745 PetscCallCUSPARSE(stat); 2746 } 2747 stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 2748 PetscCallCUSPARSE(stat); 2749 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2750 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2751 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2752 mmdata->mmBufferSize = mmBufferSize; 2753 } 2754 mmdata->initialized = PETSC_TRUE; 2755 } else { 2756 /* to be safe, 
always update pointers of the mats */ 2757 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 2758 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2759 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2760 } 2761 2762 /* do cusparseSpMM, which supports transpose on B */ 2763 stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 2764 PetscCallCUSPARSE(stat); 2765 #else 2766 PetscInt k; 2767 /* cusparseXcsrmm does not support transpose on B */ 2768 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2769 cublasHandle_t cublasv2handle; 2770 cublasStatus_t cerr; 2771 2772 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2773 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2774 PetscCallCUBLAS(cerr); 2775 blda = B->cmap->n; 2776 k = B->cmap->n; 2777 } else { 2778 k = B->rmap->n; 2779 } 2780 2781 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2782 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2783 PetscCallCUSPARSE(stat); 2784 #endif 2785 PetscCall(PetscLogGpuTimeEnd()); 2786 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2787 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2788 if (product->type == MATPRODUCT_RARt) { 2789 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2790 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2791 } else if (product->type == MATPRODUCT_PtAP) { 2792 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2793 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2794 } else { 2795 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2796 } 2797 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2798 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2799 PetscFunctionReturn(PETSC_SUCCESS); 2800 } 2801 2802 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2803 { 2804 Mat_Product *product = C->product; 2805 Mat A, B; 2806 PetscInt m, n; 2807 PetscBool cisdense, flg; 2808 MatMatCusparse *mmdata; 2809 Mat_SeqAIJCUSPARSE *cusp; 2810 2811 PetscFunctionBegin; 2812 MatCheckProduct(C, 1); 2813 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2814 A = product->A; 2815 B = product->B; 2816 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2817 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2818 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2819 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2820 switch (product->type) { 2821 case MATPRODUCT_AB: 2822 m = A->rmap->n; 2823 n = B->cmap->n; 2824 break; 
2825 case MATPRODUCT_AtB: 2826 m = A->cmap->n; 2827 n = B->cmap->n; 2828 break; 2829 case MATPRODUCT_ABt: 2830 m = A->rmap->n; 2831 n = B->rmap->n; 2832 break; 2833 case MATPRODUCT_PtAP: 2834 m = B->cmap->n; 2835 n = B->cmap->n; 2836 break; 2837 case MATPRODUCT_RARt: 2838 m = B->rmap->n; 2839 n = B->rmap->n; 2840 break; 2841 default: 2842 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2843 } 2844 PetscCall(MatSetSizes(C, m, n, m, n)); 2845 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2846 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2847 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2848 2849 /* product data */ 2850 PetscCall(PetscNew(&mmdata)); 2851 mmdata->cisdense = cisdense; 2852 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2853 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2854 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2855 #endif 2856 /* for these products we need intermediate storage */ 2857 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2858 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2859 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2860 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2861 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2862 } else { 2863 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2864 } 2865 } 2866 C->product->data = mmdata; 2867 C->product->destroy = MatDestroy_MatMatCusparse; 2868 2869 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2870 
PetscFunctionReturn(PETSC_SUCCESS); 2871 } 2872 2873 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2874 { 2875 Mat_Product *product = C->product; 2876 Mat A, B; 2877 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2878 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2879 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2880 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2881 PetscBool flg; 2882 cusparseStatus_t stat; 2883 MatProductType ptype; 2884 MatMatCusparse *mmdata; 2885 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2886 cusparseSpMatDescr_t BmatSpDescr; 2887 #endif 2888 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2889 2890 PetscFunctionBegin; 2891 MatCheckProduct(C, 1); 2892 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2893 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 2894 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2895 mmdata = (MatMatCusparse *)C->product->data; 2896 A = product->A; 2897 B = product->B; 2898 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2899 mmdata->reusesym = PETSC_FALSE; 2900 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2901 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2902 Cmat = Ccusp->mat; 2903 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2904 Ccsr = (CsrMatrix *)Cmat->mat; 2905 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2906 goto finalize; 2907 } 2908 if (!c->nz) goto finalize; 2909 PetscCall(PetscObjectTypeCompare((PetscObject)A, 
MATSEQAIJCUSPARSE, &flg)); 2910 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2911 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 2912 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2913 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2914 PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2915 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2916 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2917 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2918 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2919 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2920 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2921 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2922 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2923 2924 ptype = product->type; 2925 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2926 ptype = MATPRODUCT_AB; 2927 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2928 } 2929 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2930 ptype = MATPRODUCT_AB; 2931 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2932 } 2933 switch (ptype) { 2934 case 
MATPRODUCT_AB: 2935 Amat = Acusp->mat; 2936 Bmat = Bcusp->mat; 2937 break; 2938 case MATPRODUCT_AtB: 2939 Amat = Acusp->matTranspose; 2940 Bmat = Bcusp->mat; 2941 break; 2942 case MATPRODUCT_ABt: 2943 Amat = Acusp->mat; 2944 Bmat = Bcusp->matTranspose; 2945 break; 2946 default: 2947 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2948 } 2949 Cmat = Ccusp->mat; 2950 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 2951 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2952 PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2953 Acsr = (CsrMatrix *)Amat->mat; 2954 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2955 Ccsr = (CsrMatrix *)Cmat->mat; 2956 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 2957 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2958 PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2959 PetscCall(PetscLogGpuTimeBegin()); 2960 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2961 BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2962 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2963 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2964 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2965 PetscCallCUSPARSE(stat); 2966 #else 2967 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 2968 PetscCallCUSPARSE(stat); 2969 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 2970 PetscCallCUSPARSE(stat); 2971 #endif 2972 #else 2973 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 2974 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 2975 PetscCallCUSPARSE(stat); 2976 #endif 2977 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2978 PetscCallCUDA(WaitForCUDA()); 2979 PetscCall(PetscLogGpuTimeEnd()); 2980 C->offloadmask = PETSC_OFFLOAD_GPU; 2981 finalize: 2982 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2983 PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 2984 PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 
2985 PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 2986 c->reallocs = 0; 2987 C->info.mallocs += 0; 2988 C->info.nz_unneeded = 0; 2989 C->assembled = C->was_assembled = PETSC_TRUE; 2990 C->num_ass++; 2991 PetscFunctionReturn(PETSC_SUCCESS); 2992 } 2993 2994 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2995 { 2996 Mat_Product *product = C->product; 2997 Mat A, B; 2998 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2999 Mat_SeqAIJ *a, *b, *c; 3000 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 3001 CsrMatrix *Acsr, *Bcsr, *Ccsr; 3002 PetscInt i, j, m, n, k; 3003 PetscBool flg; 3004 cusparseStatus_t stat; 3005 MatProductType ptype; 3006 MatMatCusparse *mmdata; 3007 PetscLogDouble flops; 3008 PetscBool biscompressed, ciscompressed; 3009 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3010 int64_t C_num_rows1, C_num_cols1, C_nnz1; 3011 cusparseSpMatDescr_t BmatSpDescr; 3012 #else 3013 int cnz; 3014 #endif 3015 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3016 3017 PetscFunctionBegin; 3018 MatCheckProduct(C, 1); 3019 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3020 A = product->A; 3021 B = product->B; 3022 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 3023 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 3024 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 3025 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3026 a = (Mat_SeqAIJ *)A->data; 3027 b = (Mat_SeqAIJ *)B->data; 3028 /* product data */ 3029 PetscCall(PetscNew(&mmdata)); 3030 C->product->data = mmdata; 3031 C->product->destroy = MatDestroy_MatMatCusparse; 3032 3033 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3034 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3035 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3036 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3037 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3038 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3039 3040 ptype = product->type; 3041 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3042 ptype = MATPRODUCT_AB; 3043 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3044 } 3045 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3046 ptype = MATPRODUCT_AB; 3047 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3048 } 3049 biscompressed = PETSC_FALSE; 3050 ciscompressed = PETSC_FALSE; 3051 switch (ptype) { 3052 case MATPRODUCT_AB: 3053 m = A->rmap->n; 3054 n = B->cmap->n; 3055 k = A->cmap->n; 3056 Amat = Acusp->mat; 3057 Bmat = Bcusp->mat; 3058 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3059 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3060 break; 3061 case MATPRODUCT_AtB: 3062 m = A->cmap->n; 3063 n = B->cmap->n; 3064 k = A->rmap->n; 3065 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3066 Amat = Acusp->matTranspose; 3067 Bmat = Bcusp->mat; 3068 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3069 break; 3070 case MATPRODUCT_ABt: 3071 m = A->rmap->n; 3072 n = B->rmap->n; 3073 k = A->cmap->n; 3074 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3075 Amat = Acusp->mat; 3076 Bmat = Bcusp->matTranspose; 3077 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3078 break; 3079 default: 3080 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3081 } 3082 3083 /* create cusparse matrix */ 3084 
PetscCall(MatSetSizes(C, m, n, m, n)); 3085 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3086 c = (Mat_SeqAIJ *)C->data; 3087 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3088 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3089 Ccsr = new CsrMatrix; 3090 3091 c->compressedrow.use = ciscompressed; 3092 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3093 c->compressedrow.nrows = a->compressedrow.nrows; 3094 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3095 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3096 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3097 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3098 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3099 } else { 3100 c->compressedrow.nrows = 0; 3101 c->compressedrow.i = NULL; 3102 c->compressedrow.rindex = NULL; 3103 Ccusp->workVector = NULL; 3104 Cmat->cprowIndices = NULL; 3105 } 3106 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 3107 Ccusp->mat = Cmat; 3108 Ccusp->mat->mat = Ccsr; 3109 Ccsr->num_rows = Ccusp->nrows; 3110 Ccsr->num_cols = n; 3111 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3112 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3113 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3114 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3115 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3116 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3117 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3118 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3119 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3120 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3121 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3122 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3123 c->nz = 0; 3124 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3125 Ccsr->values = new THRUSTARRAY(c->nz); 3126 goto finalizesym; 3127 } 3128 3129 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3130 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3131 Acsr = (CsrMatrix *)Amat->mat; 3132 if (!biscompressed) { 3133 Bcsr = (CsrMatrix *)Bmat->mat; 3134 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3135 BmatSpDescr = Bmat->matDescr; 3136 #endif 3137 } else { /* we need to use row offsets for the full matrix */ 3138 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3139 Bcsr = new CsrMatrix; 3140 Bcsr->num_rows = B->rmap->n; 3141 Bcsr->num_cols = cBcsr->num_cols; 3142 Bcsr->num_entries = cBcsr->num_entries; 3143 Bcsr->column_indices = cBcsr->column_indices; 3144 Bcsr->values = cBcsr->values; 3145 if (!Bcusp->rowoffsets_gpu) { 3146 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3147 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3148 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3149 } 3150 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3151 mmdata->Bcsr = Bcsr; 3152 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3153 if (Bcsr->num_rows && Bcsr->num_cols) { 3154 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3155 PetscCallCUSPARSE(stat); 3156 } 3157 BmatSpDescr = mmdata->matSpBDescr; 3158 #endif 3159 } 3160 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3161 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3162 /* precompute flops count */ 3163 if (ptype == MATPRODUCT_AB) { 3164 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3165 const PetscInt st = a->i[i]; 3166 const PetscInt en = a->i[i + 1]; 3167 for (j = st; j < en; j++) { 3168 const PetscInt brow = a->j[j]; 3169 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3170 } 3171 } 3172 } else if (ptype == MATPRODUCT_AtB) { 3173 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3174 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3175 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3176 flops += (2. * anzi) * bnzi; 3177 } 3178 } else { /* TODO */ 3179 flops = 0.; 3180 } 3181 3182 mmdata->flops = flops; 3183 PetscCall(PetscLogGpuTimeBegin()); 3184 3185 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3186 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3187 // cuda-12.2 requires non-null csrRowOffsets 3188 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3189 PetscCallCUSPARSE(stat); 3190 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3191 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3192 { 3193 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3194 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3195 */ 3196 void *dBuffer1 = NULL; 3197 void *dBuffer2 = NULL; 3198 void *dBuffer3 = NULL; 3199 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3200 size_t bufferSize1 = 0; 3201 size_t bufferSize2 = 0; 3202 size_t bufferSize3 = 0; 3203 size_t bufferSize4 = 0; 3204 size_t bufferSize5 = 0; 3205 3206 /* ask bufferSize1 bytes for external memory */ 3207 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3208 PetscCallCUSPARSE(stat); 3209 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3210 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3211 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3212 PetscCallCUSPARSE(stat); 3213 3214 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3215 PetscCallCUSPARSE(stat); 3216 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3217 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3218 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3219 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3220 PetscCallCUSPARSE(stat); 3221 PetscCallCUDA(cudaFree(dBuffer1)); 3222 PetscCallCUDA(cudaFree(dBuffer2)); 3223 3224 /* get matrix C non-zero entries C_nnz1 */ 3225 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3226 c->nz = (PetscInt)C_nnz1; 3227 /* allocate matrix C */ 3228 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3229 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3230 Ccsr->values = new THRUSTARRAY(c->nz); 3231 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3232 /* update matC with the new pointers */ 3233 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3234 PetscCallCUSPARSE(stat); 3235 3236 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3237 PetscCallCUSPARSE(stat); 3238 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3239 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3240 PetscCallCUSPARSE(stat); 3241 PetscCallCUDA(cudaFree(dBuffer3)); 3242 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3243 PetscCallCUSPARSE(stat); 3244 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3245 } 3246 #else 3247 size_t bufSize2; 3248 /* ask bufferSize bytes for external memory */ 3249 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3250 PetscCallCUSPARSE(stat); 3251 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3252 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3253 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3254 PetscCallCUSPARSE(stat); 3255 /* ask bufferSize again bytes for external memory */ 3256 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3257 PetscCallCUSPARSE(stat); 3258 /* The CUSPARSE documentation is not clear, nor the API 3259 We need both buffers to perform the operations properly! 3260 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3261 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3262 is stored in the descriptor! What a messy API... 
*/ 3263 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3264 /* compute the intermediate product of A * B */ 3265 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3266 PetscCallCUSPARSE(stat); 3267 /* get matrix C non-zero entries C_nnz1 */ 3268 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3269 c->nz = (PetscInt)C_nnz1; 3270 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3271 mmdata->mmBufferSize / 1024)); 3272 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3273 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3274 Ccsr->values = new THRUSTARRAY(c->nz); 3275 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3276 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3277 PetscCallCUSPARSE(stat); 3278 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3279 PetscCallCUSPARSE(stat); 3280 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3281 #else 3282 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3283 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3284 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3285 PetscCallCUSPARSE(stat); 3286 c->nz = cnz; 3287 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3288 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3289 Ccsr->values = new THRUSTARRAY(c->nz); 3290 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3291 3292 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3293 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3294 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3295 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3296 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3297 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3298 PetscCallCUSPARSE(stat); 3299 #endif 3300 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3301 PetscCall(PetscLogGpuTimeEnd()); 3302 finalizesym: 3303 c->free_a = PETSC_TRUE; 3304 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3305 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3306 c->free_ij = PETSC_TRUE; 3307 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3308 PetscInt *d_i = c->i; 3309 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3310 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3311 ii = 
*Ccsr->row_offsets; 3312 jj = *Ccsr->column_indices; 3313 if (ciscompressed) d_i = c->compressedrow.i; 3314 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3315 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3316 } else { 3317 PetscInt *d_i = c->i; 3318 if (ciscompressed) d_i = c->compressedrow.i; 3319 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3320 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3321 } 3322 if (ciscompressed) { /* need to expand host row offsets */ 3323 PetscInt r = 0; 3324 c->i[0] = 0; 3325 for (k = 0; k < c->compressedrow.nrows; k++) { 3326 const PetscInt next = c->compressedrow.rindex[k]; 3327 const PetscInt old = c->compressedrow.i[k]; 3328 for (; r < next; r++) c->i[r + 1] = old; 3329 } 3330 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3331 } 3332 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3333 PetscCall(PetscMalloc1(m, &c->ilen)); 3334 PetscCall(PetscMalloc1(m, &c->imax)); 3335 c->maxnz = c->nz; 3336 c->nonzerorowcnt = 0; 3337 c->rmax = 0; 3338 for (k = 0; k < m; k++) { 3339 const PetscInt nn = c->i[k + 1] - c->i[k]; 3340 c->ilen[k] = c->imax[k] = nn; 3341 c->nonzerorowcnt += (PetscInt) !!nn; 3342 c->rmax = PetscMax(c->rmax, nn); 3343 } 3344 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3345 PetscCall(PetscMalloc1(c->nz, &c->a)); 3346 Ccsr->num_entries = c->nz; 3347 3348 C->nonzerostate++; 3349 PetscCall(PetscLayoutSetUp(C->rmap)); 3350 PetscCall(PetscLayoutSetUp(C->cmap)); 3351 Ccusp->nonzerostate = C->nonzerostate; 3352 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3353 C->preallocated = PETSC_TRUE; 3354 C->assembled = 
PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic routine for the product attached to mat.

  Handles sparse or dense B. The decision is driven by
    - whether product->B has base type MATSEQDENSE,
    - whether product->B (and, for MATPRODUCT_ABC, product->C) is MATSEQAIJCUSPARSE and not bound to the CPU,
    - an optional "*_backend_cpu" command line flag that lets the user force the CPU code path.
  Either mat->ops->productsymbolic is set, or the choice is delegated to
  MatProductSetFromOptions_SeqAIJ_SeqDense() / MatProductSetFromOptions_SeqAIJ().
  Product types not listed below leave mat->ops->productsymbolic untouched.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B only counts as a cuSPARSE matrix when neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  /* C only matters for the triple product; for every other type Ciscusp stays PETSC_TRUE */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu   = PETSC_FALSE;
    const char *apiname  = NULL; /* user-level routine: options title for api_user, manual page in both cases */
    const char *apiflag  = NULL; /* option name used when the product was created through the user-level routine */
    const char *prodname = NULL; /* options title used when the product was created through the MatProduct API */

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiflag  = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiflag  = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiflag  = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiflag  = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiflag  = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no backend option is offered for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title = product->api_user ? apiname : prodname;
      const char *flag  = product->api_user ? apiflag : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(flag, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  {
    const MatProductType ptype  = product->type;
    const PetscBool      pair   = (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) ? PETSC_TRUE : PETSC_FALSE;
    const PetscBool      projct = (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) ? PETSC_TRUE : PETSC_FALSE;

    if (isdense) {
      if (pair || projct) {
        if (product->A->boundtocpu) {
          PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
        } else {
          mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
        }
      } else if (ptype == MATPRODUCT_ABC) {
        mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      }
    } else if (Biscusp && Ciscusp) {
      if (pair) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      } else if (projct || ptype == MATPRODUCT_ABC) {
        mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      }
    } else { /* fallback for AIJ */
      PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A xx: forwards to the shared kernel with no vector to add (NULL) and no transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx,
Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A xx + yy: no transpose, not Hermitian */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H xx: forwards to the shared kernel with trans = herm = PETSC_TRUE and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H xx + yy: forwards to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T xx: forwards to the shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Launch layout: 1-D grid of 1-D blocks providing at least n threads; surplus threads do nothing.
  The update is a plain (non-atomic) read-modify-write, so the entries of idx must be pairwise
  distinct -- NOTE(review): presumably guaranteed by the caller, confirm at the launch site.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt: the former 32-bit product stored in an int wrapped negative
     once it passed INT_MAX, slipped through the i < n guard for 64-bit n, and indexed out of bounds */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3526 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3527 { 3528 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3529 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3530 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3531 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3532 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3533 PetscBool compressed; 3534 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3535 PetscInt nx, ny; 3536 #endif 3537 3538 PetscFunctionBegin; 3539 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3540 if (!a->nz) { 3541 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3542 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3543 PetscFunctionReturn(PETSC_SUCCESS); 3544 } 3545 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3546 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3547 if (!trans) { 3548 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3549 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3550 } else { 3551 if (herm || !A->form_explicit_transpose) { 3552 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3553 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3554 } else { 3555 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3556 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3557 } 3558 } 3559 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3560 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3561 3562 try { 3563 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3564 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3565 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3566 3567 PetscCall(PetscLogGpuTimeBegin()); 3568 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3569 /* z = A x + beta y. 3570 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3571 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3572 */ 3573 xptr = xarray; 3574 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3575 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3576 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3577 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3578 allocated to accommodate different uses. So we get the length info directly from mat. 3579 */ 3580 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3581 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3582 nx = mat->num_cols; 3583 ny = mat->num_rows; 3584 } 3585 #endif 3586 } else { 3587 /* z = A^T x + beta y 3588 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3589 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3590 */ 3591 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3592 dptr = zarray; 3593 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3594 if (compressed) { /* Scatter x to work vector */ 3595 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3596 3597 thrust::for_each( 3598 #if PetscDefined(HAVE_THRUST_ASYNC) 3599 thrust::cuda::par.on(PetscDefaultCudaStream), 3600 #endif 3601 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3602 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3603 } 3604 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3605 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3606 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3607 nx = mat->num_rows; 3608 ny = mat->num_cols; 3609 } 3610 #endif 3611 } 3612 3613 /* csr_spmv does y = alpha op(A) x + beta y */ 3614 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3615 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3616 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3617 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3618 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3619 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3620 PetscCallCUSPARSE( 3621 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3622 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3623 3624 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 
3625 } else { 3626 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3627 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3628 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3629 } 3630 3631 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3632 matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3633 #else 3634 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3635 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3636 #endif 3637 } else { 3638 if (cusparsestruct->nrows) { 3639 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3640 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3641 #else 3642 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3643 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3644 #endif 3645 } 3646 } 3647 PetscCall(PetscLogGpuTimeEnd()); 3648 3649 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3650 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3651 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3652 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3653 } else if (zz != yy) { /* A is not compressed. 
zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
        */
  #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
  #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
  #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - forwards (A, xx, yy, zz) to the shared multiply-add driver
  MatMultAddKernel_SeqAIJCUSPARSE() and propagates its error code.

  NOTE(review): the trailing (PETSC_TRUE, PETSC_FALSE) flags presumably select "transpose, not Hermitian";
  confirm against the driver's signature, whose head lies outside this part of the file.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAssemblyEnd_SeqAIJCUSPARSE - assembly hook for the CUSPARSE type; it performs no device work of its own
  and simply delegates to the host implementation MatAssemblyEnd_SeqAIJ().
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.
That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* Step 1: bare Mat object on the caller's communicator */
  PetscCall(MatCreate(comm, A));
  /* Step 2: the same (m, n) is passed for both the local and the global sizes */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  /* Step 3: switch the object to the CUSPARSE-backed sequential AIJ type */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* Step 4: preallocate through the SeqAIJ routine directly; it takes a non-const array, hence the cast */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - duplicates A with the host SeqAIJ routine, then converts the copy in place
  to MATSEQAIJCUSPARSE so that it carries the CUSPARSE operations.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y.

  Dispatch, in order:
    - X and Y do not share the same axpy implementation: host MatAXPY_SeqAIJ() (the cached transpose of Y is invalidated first);
    - SUBSET_NONZERO_PATTERN: cusparse_csr_spgeam() writing the result in place into the value array of Y;
    - SAME_NONZERO_PATTERN:   a single cublasXaxpy() over the x->nz stored values;
    - anything else:          host MatAXPY_SeqAIJ() (the cached transpose of Y is invalidated first).

  Before dispatching, `str` may be adjusted:
    - promoted to SAME_NONZERO_PATTERN when both matrices have the same nz, neither uses compressed rows, and the
      device row offsets and column indices compare equal;
    - demoted to DIFFERENT_NONZERO_PATTERN for single-column matrices that are not SAME_NONZERO_PATTERN
      (the original author notes spgeam is buggy there).

  Only MAT_CUSPARSE_CSR storage is accepted on the device paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x  = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;

  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* column indices are only compared when the row offsets already agree (short-circuit &&) */
    const bool identical = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()) && thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());

    if (identical) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b are passed by host address below, so the handle is switched to host pointer mode for the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    /* identical patterns: the two value arrays line up entry by entry, so a dense axpy over them is enough */
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  default:
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *ay;
  cublasHandle_t cublasv2handle;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatZeroEntries_SeqAIJCUSPARSE - zeroes the stored values of A on the host and, where device copies exist,
  on the device as well (both the matrix and its cached transpose).

  The offload mask ends up as PETSC_OFFLOAD_BOTH only when the device value array of the (non-transposed)
  matrix was zeroed too; otherwise the host copy is the only valid one (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij         = (Mat_SeqAIJ *)A->data;
  PetscBool   deviceDone  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        deviceDone = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* a->i[nrows] is the total number of stored entries in the host CSR arrays */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = deviceDone ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatBindToCPU_SeqAIJCUSPARSE - switches the operation tables of A between the host (SeqAIJ) and the
  device (SeqAIJCUSPARSE) implementations.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to bind to the CPU, PETSC_FALSE to (re)install the GPU operations

  Factored matrices only record the flag. For unfactored matrices, binding to the CPU first copies the data
  back from the GPU, clears the SeqAIJ sub-operation table and removes the composed GPU-specific functions;
  unbinding installs them again. Inodes are enabled only while bound to the CPU and a->inode.size is set.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  if (!flg) {
    /* ---- GPU operation table ---- */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* ---- host operation table; bring the values back from the device first ---- */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipes every entry of the SeqAIJ sub-operation table in one go */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  }

  A->boundtocpu  = flg;
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCreate_SeqAIJCUSPARSE - type constructor for MATSEQAIJCUSPARSE: builds B as a plain SeqAIJ matrix and
  then converts it in place, which installs the CUSPARSE data structures and operation tables.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host-side SeqAIJ construction comes first; the in-place conversion below layers the GPU support on top */
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format (ELL and Hybrid are rejected with `PETSC_ERR_SUP` when PETSc is built
   against CUDA 11.0 or later).
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                      Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  MatSolverTypeRegister_CUSPARSE - registers MatGetFactor_seqaijcusparse_cusparse() as the MATSOLVERCUSPARSE
  factory for MATSEQAIJCUSPARSE matrices, once for each of LU, Cholesky, ILU and ICC (in that order).
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType kinds[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t k = 0; k < sizeof(kinds) / sizeof(kinds[0]); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, kinds[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSE_Destroy - releases everything hanging off mat->spptr for an unfactored matrix: the
  mult structures of the matrix and of its transpose, the auxiliary device arrays, the cuSPARSE handle
  (when set) and finally the Mat_SeqAIJCUSPARSE struct itself. Does nothing when spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!gpu) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->mat, gpu->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose, gpu->format));
  delete gpu->workVector;
  delete gpu->rowoffsets_gpu;
  delete gpu->csr2csc_i;
  delete gpu->coords;
  if (gpu->handle) PetscCallCUSPARSE(cusparseDestroy(gpu->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  CsrMatrix_Destroy - deletes the three arrays owned by a CsrMatrix, then the struct, and resets the
  caller's pointer. Safe to call when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - releases the descriptor, the solve
  info, the CSR copy and the device/pinned-host scratch buffers of one triangular factor, then frees the
  struct. Every resource is released only when it is set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
    if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
    PetscCall(CsrMatrix_Destroy(&tf->csrMat));
    if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
    if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h)); /* released with cudaFreeHost, unlike the device buffers */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)
PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases every resource held by a triangular-factors struct while
  keeping the struct itself (and its cuSPARSE handle) alive so it can be reused.

  Input Parameter:
. trifactors - address of the pointer to the struct; nothing is done when it points to NULL

  Every released field is cleared right after its release. Reset is also invoked by
  MatSeqAIJCUSPARSETriFactors_Destroy(), so without clearing, any resource that was not re-created between
  two resets would be handed to cudaFree()/cusparseDestroy*() a second time.

  NOTE(review): as in the original code, the cusparseDestroy*() routines may be called with a NULL
  descriptor here (e.g. on a freshly allocated struct); this relies on cuSPARSE tolerating that.
  NOTE(review): factBuffer_M is not touched; per the original comment it aliases one of spsvBuffer_L/U.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device arrays */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* host mirrors; PetscFree() is applied to the fields directly */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular-factors struct, destroys its cuSPARSE handle
  (when one was created, matching the guard used by MatSeqAIJCUSPARSE_Destroy()) and frees the struct.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict-weak ordering on (i, j) pairs: by the first component, ties broken by the second */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    return i1 < i2 || (i1 == i2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};

static PetscErrorCode
MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  /* Marks the cached transpose of A as out of date (A->transupdated = PETSC_FALSE); when `destroy` is set,
     the transpose mult structure and the csr2csc index map are released as well. No-op when A has no spptr. */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - container destructor for the device-side COO struct: frees the
  device arrays `perm` and `jmap` with cudaFree() and then the host-allocated struct itself.
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *dcoo = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(dcoo->perm));
  PetscCallCUDA(cudaFree(dcoo->jmap));
  PetscCall(PetscFree(dcoo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE.

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices, in host or device memory
- coo_j - column indices, assumed to live in the same memory space as coo_i (only coo_i is queried)

  The analysis itself is done on the host by MatSetPreallocationCOO_SeqAIJ(); device index arrays are
  copied to temporary host buffers for that. Afterwards the matrix is pushed to the GPU and a shallow
  copy of the host COO struct, with `jmap` and `perm` replaced by device copies, is attached to the matrix
  under "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype   = PETSC_MEMTYPE_HOST;
  PetscBool            staged  = PETSC_FALSE; /* true when the indices had to be copied device -> host */
  PetscInt            *rows, *cols;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  // NOTE(review): no MatResetPreallocationCOO_* call is made in this function; the comment above looks stale.
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (!PetscMemTypeDevice(mtype)) {
    rows = coo_i;
    cols = coo_j;
  } else {
    staged = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &rows, coo_n, &cols));
    PetscCallCUDA(cudaMemcpy(rows, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(cols, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, rows, cols));
  if (staged) PetscCall(PetscFree2(rows, cols));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  // drop the local reference to the container after composing it
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAddCOOValues - kernel that accumulates COO input values into the stored nonzeros of a matrix.

  Parameters:
    kv    - COO values supplied by the caller
    nnz   - number of stored nonzeros (entries of a[])
    jmap  - offsets into perm[]: nonzero i receives the entries perm[jmap[i]] .. perm[jmap[i+1]-1]
    perm  - indices into kv[]
    imode - INSERT_VALUES overwrites a[i] with the sum; any other mode adds the sum to a[i]
    a     - matrix values, updated in place

  Launch layout: 1D grid of 1D blocks with a grid-stride loop, so any grid size is valid.
  The thread index and the stride are widened to PetscCount *before* multiplying: blockIdx.x, blockDim.x
  and gridDim.x are unsigned int, so the products would otherwise wrap in 32-bit arithmetic for very large
  nnz (relevant when PetscCount is wider than unsigned int).
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;

    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES)
PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 4368 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4369 4370 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4371 PetscFunctionReturn(PETSC_SUCCESS); 4372 } 4373 4374 /*@C 4375 MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices. 4376 4377 Not Collective 4378 4379 Input Parameters: 4380 + A - the matrix 4381 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 4382 4383 Output Parameters: 4384 + i - the CSR row pointers 4385 - j - the CSR column indices 4386 4387 Level: developer 4388 4389 Note: 4390 When compressed is true, the CSR structure does not contain empty rows 4391 4392 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4393 @*/ 4394 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4395 { 4396 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4397 CsrMatrix *csr; 4398 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4399 4400 PetscFunctionBegin; 4401 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4402 if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 4403 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4404 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4405 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4406 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4407 csr = (CsrMatrix *)cusp->mat->mat; 4408 if (i) { 4409 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4410 if (!cusp->rowoffsets_gpu) { 4411 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4412 cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 4413 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4414 } 4415 *i = 
cusp->rowoffsets_gpu->data().get(); 4416 } else *i = csr->row_offsets->data().get(); 4417 } 4418 if (j) *j = csr->column_indices->data().get(); 4419 PetscFunctionReturn(PETSC_SUCCESS); 4420 } 4421 4422 /*@C 4423 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()` 4424 4425 Not Collective 4426 4427 Input Parameters: 4428 + A - the matrix 4429 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 4430 . i - the CSR row pointers 4431 - j - the CSR column indices 4432 4433 Level: developer 4434 4435 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()` 4436 @*/ 4437 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4438 { 4439 PetscFunctionBegin; 4440 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4441 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4442 if (i) *i = NULL; 4443 if (j) *j = NULL; 4444 (void)compressed; 4445 PetscFunctionReturn(PETSC_SUCCESS); 4446 } 4447 4448 /*@C 4449 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4450 4451 Not Collective 4452 4453 Input Parameter: 4454 . A - a `MATSEQAIJCUSPARSE` matrix 4455 4456 Output Parameter: 4457 . 
a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *storage;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a flat value array */
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date before handing out a pointer to it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  storage = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(storage->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = storage->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix is left untouched, only the caller's pointer is cleared */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective
4511 4512 Input Parameter: 4513 . A - a `MATSEQAIJCUSPARSE` matrix 4514 4515 Output Parameter: 4516 . a - pointer to the device data 4517 4518 Level: developer 4519 4520 Note: 4521 May trigger host-device copies if up-to-date matrix data is on host 4522 4523 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4524 @*/ 4525 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4526 { 4527 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4528 CsrMatrix *csr; 4529 4530 PetscFunctionBegin; 4531 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4532 PetscAssertPointer(a, 2); 4533 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4534 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4535 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4536 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4537 csr = (CsrMatrix *)cusp->mat->mat; 4538 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4539 *a = csr->values->data().get(); 4540 A->offloadmask = PETSC_OFFLOAD_GPU; 4541 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4542 PetscFunctionReturn(PETSC_SUCCESS); 4543 } 4544 /*@C 4545 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 4546 4547 Not Collective 4548 4549 Input Parameters: 4550 + A - a `MATSEQAIJCUSPARSE` matrix 4551 - a - pointer to the device data 4552 4553 Level: developer 4554 4555 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()` 4556 @*/ 4557 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4558 { 4559 PetscFunctionBegin; 4560 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4561 PetscAssertPointer(a, 2); 4562 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4563 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4564 
PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4565 *a = NULL; 4566 PetscFunctionReturn(PETSC_SUCCESS); 4567 } 4568 4569 /*@C 4570 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 4571 4572 Not Collective 4573 4574 Input Parameter: 4575 . A - a `MATSEQAIJCUSPARSE` matrix 4576 4577 Output Parameter: 4578 . a - pointer to the device data 4579 4580 Level: developer 4581 4582 Note: 4583 Does not trigger host-device copies and flags data validity on the GPU 4584 4585 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 4586 @*/ 4587 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4588 { 4589 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4590 CsrMatrix *csr; 4591 4592 PetscFunctionBegin; 4593 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4594 PetscAssertPointer(a, 2); 4595 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4596 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4597 PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4598 csr = (CsrMatrix *)cusp->mat->mat; 4599 PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4600 *a = csr->values->data().get(); 4601 A->offloadmask = PETSC_OFFLOAD_GPU; 4602 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4603 PetscFunctionReturn(PETSC_SUCCESS); 4604 } 4605 4606 /*@C 4607 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 4608 4609 Not Collective 4610 4611 Input Parameters: 4612 + A - a `MATSEQAIJCUSPARSE` matrix 4613 - a - pointer to the device data 4614 4615 Level: developer 4616 4617 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()` 4618 @*/ 4619 PetscErrorCode 
MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4620 { 4621 PetscFunctionBegin; 4622 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4623 PetscAssertPointer(a, 2); 4624 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4625 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4626 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4627 *a = NULL; 4628 PetscFunctionReturn(PETSC_SUCCESS); 4629 } 4630 4631 struct IJCompare4 { 4632 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4633 { 4634 if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 4635 if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4636 return false; 4637 } 4638 }; 4639 4640 struct Shift { 4641 int _shift; 4642 4643 Shift(int shift) : _shift(shift) { } 4644 __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4645 }; 4646 4647 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in MATLAB notation */
/*
  Input Parameters:
+ A     - first MATSEQAIJCUSPARSE matrix (CSR format only)
. B     - second MATSEQAIJCUSPARSE matrix (CSR format only), with the same number of rows as A
- reuse - MAT_INITIAL_MATRIX creates C; any other accepted value only refreshes the values of an existing C
          (MAT_INPLACE_MATRIX is rejected)

  Output Parameter:
. C - matrix with the rows of A and B side by side: A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  With MAT_INITIAL_MATRIX the positions in C of the entries of A, followed by the positions of the entries of B,
  are stored in Ccusp->coords; the reuse path scatters the values of A and B through that permutation.

  The values of C are only computed on the device: the host value array is allocated but not filled, and the
  result is flagged as PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into explicit per-entry row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag every entry with its origin: 1 for A, 0 for B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* the columns of B are shifted by the number of columns of A on the fly */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz);
      /* merge the (row, column, value, tag) tuples of A and B, ordered by (row, column) */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift of the columns of B */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* positions in C of the entries tagged 1 (from A) go to coords[0, Annz), those tagged 0 (from B) to coords[Annz, nz) */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the last offset of A^T is overwritten by the first (shifted) offset of B^T */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero structure of C on the host */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* host values are allocated but not filled: the valid values live on the GPU (see the offloadmask below) */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0, nnz(A)) addresses the entries of A inside C, the remainder addresses the entries of B */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* refresh the values of the transpose of C from the transposes of A and B */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Copy (a subset of) the device matrix values into v[].

  Input Parameters:
+ A   - the matrix
. n   - number of values to copy
- idx - host array with the positions of the values to extract, or NULL to copy the first n values

  Output Parameter:
. v - the extracted values, in host or device memory (detected with isCudaMem())

  When v[] is on the host the values cross from the GPU to the CPU, which is logged with PetscLogGpuToCpu().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* gather directly into v[] when it lives on the device, otherwise into a temporary device buffer */
    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* both host paths above copy n scalars device -> host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}