1a4963045SJacob Faibussowitsch #pragma once
2d4002b98SHong Zhang
3d4002b98SHong Zhang #include <petsc/private/matimpl.h>
4eec179cfSJacob Faibussowitsch #include <petsc/private/hashmapi.h>
5d4002b98SHong Zhang
6d4002b98SHong Zhang /*
74e58db63SHong Zhang For NVIDIA GPUs each slice should be padded to the boundary of 16 elements for best performance.
84e58db63SHong Zhang The optimal memory alignment in device memory is 128 bytes, 64 bytes, 32 bytes for double precision, single precision and half precision.
94e58db63SHong Zhang */
104e58db63SHong Zhang #if defined(PETSC_HAVE_DEVICE)
114e58db63SHong Zhang #define DEVICE_MEM_ALIGN 16
124e58db63SHong Zhang #endif
134e58db63SHong Zhang
144e58db63SHong Zhang /*
15d4002b98SHong Zhang Struct header for SeqSELL matrix format
16d4002b98SHong Zhang */
17d4002b98SHong Zhang #define SEQSELLHEADER(datatype) \
18d4002b98SHong Zhang PetscBool roworiented; /* if true, row-oriented input, default */ \
19d4002b98SHong Zhang PetscInt nonew; /* 1 don't add new nonzeros, -1 generate error on new */ \
20d4002b98SHong Zhang PetscInt nounused; /* -1 generate error on unused space */ \
21d4002b98SHong Zhang PetscBool singlemalloc; /* if true a, i, and j have been obtained with one big malloc */ \
22d4002b98SHong Zhang PetscInt maxallocmat; /* max allocated space for the matrix */ \
23d4002b98SHong Zhang PetscInt maxallocrow; /* max allocated space for each row */ \
24d4002b98SHong Zhang PetscInt nz; /* actual nonzeros */ \
25d4002b98SHong Zhang PetscInt rlenmax; /* max actual row length, rmax cannot exceed maxallocrow */ \
26d4002b98SHong Zhang PetscInt *rlen; /* actual length of each row (padding zeros excluded) */ \
27d4002b98SHong Zhang PetscBool free_rlen; /* free rlen array ? */ \
28d4002b98SHong Zhang PetscInt reallocs; /* number of mallocs done during MatSetValues() \
29d4002b98SHong Zhang as more values are set than were prealloced */ \
300b4b7b1cSBarry Smith PetscBool keepnonzeropattern; /* keeps matrix nonzero structure the same in calls to MatZeroRows()*/ \
31d4002b98SHong Zhang PetscBool ignorezeroentries; \
32d4002b98SHong Zhang PetscBool free_colidx; /* free the column indices colidx when the matrix is destroyed */ \
33d4002b98SHong Zhang PetscBool free_val; /* free the numerical values when matrix is destroy */ \
34d4002b98SHong Zhang PetscInt *colidx; /* column index */ \
35d4002b98SHong Zhang PetscInt *diag; /* pointers to diagonal elements */ \
36*07425a8dSBarry Smith PetscObjectState diagNonzeroState; /* nonzero state of the matrix when diag was obtained */ \
37*07425a8dSBarry Smith PetscBool diagDense; /* matrix contains all the diagonal entries */ \
38d4002b98SHong Zhang PetscInt nonzerorowcnt; /* how many rows have nonzero entries */ \
39d4002b98SHong Zhang datatype *val; /* elements including nonzeros and padding zeros */ \
40d4002b98SHong Zhang PetscScalar *solve_work; /* work space used in MatSolve */ \
41d4002b98SHong Zhang IS row, col, icol; /* index sets, used for reorderings */ \
42d4002b98SHong Zhang PetscBool pivotinblocks; /* pivot inside factorization of each diagonal block */ \
43d4002b98SHong Zhang Mat parent; /* set if this matrix was formed with MatDuplicate(...,MAT_SHARE_NONZERO_PATTERN,....);
44d4002b98SHong Zhang means that this shares some data structures with the parent including diag, ilen, imax, i, j */ \
45d4002b98SHong Zhang PetscInt *sliidx; /* slice index */ \
466108893eSStefano Zampini PetscInt totalslices; /* total number of slices */ \
4707e43b41SHong Zhang PetscInt sliceheight; /* slice height */ \
4807e43b41SHong Zhang PetscReal fillratio; /* ratio of number of padded zeros over total number of elements */ \
4907e43b41SHong Zhang PetscReal avgslicewidth; /* average slice width */ \
5007e43b41SHong Zhang PetscInt maxslicewidth; /* maximum slice width */ \
51b921024eSHong Zhang PetscReal varslicesize; /* variance of slice size */ \
5207e43b41SHong Zhang PetscInt *sliperm; /* slice permutation array, CUDA only */ \
5307e43b41SHong Zhang PetscInt totalblocks; /* total number of blocks, CUDA only */ \
5407e43b41SHong Zhang PetscInt *blockidx; /* block index, CUDA only */ \
5507e43b41SHong Zhang PetscInt *block_row_map; /* starting row of the current block, CUDA only */ \
5690d2215bSHong Zhang PetscInt chunksize; /* chunk size, CUDA only */ \
5790d2215bSHong Zhang PetscInt totalchunks; /* total number of chunks, CUDA only */ \
58baca6076SPierre Jolivet PetscInt *chunk_slice_map; /* starting slice of the current chunk, CUDA only */ \
596108893eSStefano Zampini PetscInt *getrowcols; /* workarray for MatGetRow_SeqSELL */ \
609371c9d4SSatish Balay PetscScalar *getrowvals /* workarray for MatGetRow_SeqSELL */
61d4002b98SHong Zhang
62d4002b98SHong Zhang typedef struct {
63d4002b98SHong Zhang SEQSELLHEADER(MatScalar);
64d4002b98SHong Zhang MatScalar *saved_values; /* location for stashing nonzero values of matrix */
65*07425a8dSBarry Smith
66*07425a8dSBarry Smith /* data needed for MatSOR_SeqAIJ() */
67*07425a8dSBarry Smith PetscScalar *mdiag, *idiag; /* diagonal values, inverse of diagonal entries */
68*07425a8dSBarry Smith PetscScalar *ssor_work; /* workspace for Eisenstat trick */
69*07425a8dSBarry Smith PetscObjectState idiagState; /* state of the matrix when mdiag and idiag was obtained */
70d4002b98SHong Zhang PetscScalar fshift, omega; /* last used omega and fshift */
71*07425a8dSBarry Smith
72d4002b98SHong Zhang ISColoring coloring; /* set with MatADSetColoring() used by MatADSetValues() */
73d4002b98SHong Zhang } Mat_SeqSELL;
74d4002b98SHong Zhang
75d4002b98SHong Zhang /*
76d4002b98SHong Zhang Frees the arrays from the XSELLPACK matrix type
77d4002b98SHong Zhang */
MatSeqXSELLFreeSELL(Mat AA,MatScalar ** val,PetscInt ** colidx)78d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatSeqXSELLFreeSELL(Mat AA, MatScalar **val, PetscInt **colidx)
79d71ae5a4SJacob Faibussowitsch {
80d4002b98SHong Zhang Mat_SeqSELL *A = (Mat_SeqSELL *)AA->data;
81d4002b98SHong Zhang if (A->singlemalloc) {
829566063dSJacob Faibussowitsch PetscCall(PetscFree2(*val, *colidx));
83d4002b98SHong Zhang } else {
849566063dSJacob Faibussowitsch if (A->free_val) PetscCall(PetscFree(*val));
859566063dSJacob Faibussowitsch if (A->free_colidx) PetscCall(PetscFree(*colidx));
86d4002b98SHong Zhang }
873ba16761SJacob Faibussowitsch return PETSC_SUCCESS;
88d4002b98SHong Zhang }
89d4002b98SHong Zhang
904e58db63SHong Zhang #define MatSeqXSELLReallocateSELL(Amat, AM, BS2, WIDTH, SIDX, SH, SID, ROW, COL, COLIDX, VAL, CP, VP, NONEW, datatype, MUL) \
91a8f51744SPierre Jolivet do { \
9207e43b41SHong Zhang if (WIDTH >= (SIDX[SID + 1] - SIDX[SID]) / SH) { \
93d4002b98SHong Zhang Mat_SeqSELL *Ain = (Mat_SeqSELL *)Amat->data; \
942d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \
954e58db63SHong Zhang PetscInt new_size = Ain->maxallocmat + SH * MUL, *new_colidx; \
96d4002b98SHong Zhang datatype *new_val; \
97d4002b98SHong Zhang \
9800045ab3SPierre Jolivet PetscCheck(NONEW != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc. Use MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", ROW, COL); \
99d4002b98SHong Zhang /* malloc new storage space */ \
1009566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(BS2 * new_size, &new_val, BS2 * new_size, &new_colidx)); \
101d4002b98SHong Zhang \
102d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \
1039566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_val, VAL, SIDX[SID + 1])); \
1049566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_colidx, COLIDX, SIDX[SID + 1])); \
1054e58db63SHong Zhang PetscCall(PetscArraycpy(new_val + SIDX[SID + 1] + SH * MUL, VAL + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \
1064e58db63SHong Zhang PetscCall(PetscArraycpy(new_colidx + SIDX[SID + 1] + SH * MUL, COLIDX + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \
107d4002b98SHong Zhang /* update slice_idx */ \
1080b36fe0aSPierre Jolivet for (ii = SID + 1; ii <= Ain->totalslices; ii++) SIDX[ii] += SH * MUL; \
109baca6076SPierre Jolivet /* update pointers. Notice that they point to the FIRST position of the row */ \
11007e43b41SHong Zhang CP = new_colidx + SIDX[SID] + (ROW % SH); \
11107e43b41SHong Zhang VP = new_val + SIDX[SID] + (ROW % SH); \
112d4002b98SHong Zhang /* free up old matrix storage */ \
1139566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &Ain->val, &Ain->colidx)); \
114835f2295SStefano Zampini Ain->val = new_val; \
115d4002b98SHong Zhang Ain->colidx = new_colidx; \
116d4002b98SHong Zhang Ain->singlemalloc = PETSC_TRUE; \
117d4002b98SHong Zhang Ain->maxallocmat = new_size; \
118d4002b98SHong Zhang Ain->reallocs++; \
119b758ae8cSBarry Smith A->nonzerostate++; \
1204e58db63SHong Zhang if (WIDTH >= Ain->maxallocrow) Ain->maxallocrow += MUL; \
121d4002b98SHong Zhang if (WIDTH >= Ain->rlenmax) Ain->rlenmax++; \
122a8f51744SPierre Jolivet } \
123a8f51744SPierre Jolivet } while (0)
124d4002b98SHong Zhang
125d4002b98SHong Zhang #define MatSetValue_SeqSELL_Private(A, row, col, value, addv, orow, ocol, cp, vp, lastcol, low, high) \
126a8f51744SPierre Jolivet do { \
127d4002b98SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; \
128d4002b98SHong Zhang found = PETSC_FALSE; \
129d4002b98SHong Zhang if (col <= lastcol) low = 0; \
130d4002b98SHong Zhang else high = a->rlen[row]; \
131d4002b98SHong Zhang lastcol = col; \
132d4002b98SHong Zhang while (high - low > 5) { \
133d4002b98SHong Zhang t = (low + high) / 2; \
13407e43b41SHong Zhang if (*(cp + a->sliceheight * t) > col) high = t; \
135d4002b98SHong Zhang else low = t; \
136d4002b98SHong Zhang } \
137d4002b98SHong Zhang for (_i = low; _i < high; _i++) { \
13807e43b41SHong Zhang if (*(cp + a->sliceheight * _i) > col) break; \
13907e43b41SHong Zhang if (*(cp + a->sliceheight * _i) == col) { \
14007e43b41SHong Zhang if (addv == ADD_VALUES) *(vp + a->sliceheight * _i) += value; \
14107e43b41SHong Zhang else *(vp + a->sliceheight * _i) = value; \
142d4002b98SHong Zhang found = PETSC_TRUE; \
143d4002b98SHong Zhang break; \
144d4002b98SHong Zhang } \
145d4002b98SHong Zhang } \
146d4002b98SHong Zhang if (!found) { \
14708401ef6SPierre Jolivet PetscCheck(a->nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
14807e43b41SHong Zhang if (a->nonew != 1 && !(value == 0.0 && a->ignorezeroentries) && a->rlen[row] >= (a->sliidx[row / a->sliceheight + 1] - a->sliidx[row / a->sliceheight]) / a->sliceheight) { \
1492d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \
15007e43b41SHong Zhang if (a->maxallocmat < a->sliidx[a->totalslices] + a->sliceheight) { \
151d4002b98SHong Zhang /* allocates a larger array for the XSELL matrix types; only extend the current slice by one more column. */ \
15207e43b41SHong Zhang PetscInt new_size = a->maxallocmat + a->sliceheight, *new_colidx; \
153d4002b98SHong Zhang MatScalar *new_val; \
15400045ab3SPierre Jolivet PetscCheck(a->nonew != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc. Use MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", orow, ocol); \
155d4002b98SHong Zhang /* malloc new storage space */ \
1569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(new_size, &new_val, new_size, &new_colidx)); \
157d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \
15807e43b41SHong Zhang PetscCall(PetscArraycpy(new_val, a->val, a->sliidx[row / a->sliceheight + 1])); \
15907e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx, a->colidx, a->sliidx[row / a->sliceheight + 1])); \
1608e3a54c0SPierre Jolivet PetscCall(PetscArraycpy(new_val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, PetscSafePointerPlusOffset(a->val, a->sliidx[row / a->sliceheight + 1]), a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \
1618e3a54c0SPierre Jolivet PetscCall(PetscArraycpy(new_colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, PetscSafePointerPlusOffset(a->colidx, a->sliidx[row / a->sliceheight + 1]), a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \
162baca6076SPierre Jolivet /* update pointers. Notice that they point to the FIRST position of the row */ \
16307e43b41SHong Zhang cp = new_colidx + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \
16407e43b41SHong Zhang vp = new_val + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \
165d4002b98SHong Zhang /* free up old matrix storage */ \
1669566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); \
167835f2295SStefano Zampini a->val = new_val; \
168d4002b98SHong Zhang a->colidx = new_colidx; \
169d4002b98SHong Zhang a->singlemalloc = PETSC_TRUE; \
170d4002b98SHong Zhang a->maxallocmat = new_size; \
171d4002b98SHong Zhang a->reallocs++; \
172d4002b98SHong Zhang } else { \
173d4002b98SHong Zhang /* no need to reallocate, just shift the following slices to create space for the added slice column */ \
17407e43b41SHong Zhang PetscCall(PetscArraymove(a->val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \
17507e43b41SHong Zhang PetscCall(PetscArraymove(a->colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \
176d4002b98SHong Zhang } \
177d4002b98SHong Zhang /* update slice_idx */ \
17807e43b41SHong Zhang for (ii = row / a->sliceheight + 1; ii <= a->totalslices; ii++) a->sliidx[ii] += a->sliceheight; \
179d4002b98SHong Zhang if (a->rlen[row] >= a->maxallocrow) a->maxallocrow++; \
180d4002b98SHong Zhang if (a->rlen[row] >= a->rlenmax) a->rlenmax++; \
181d4002b98SHong Zhang } \
182d4002b98SHong Zhang /* shift up all the later entries in this row */ \
183d4002b98SHong Zhang for (ii = a->rlen[row] - 1; ii >= _i; ii--) { \
18407e43b41SHong Zhang *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); \
18507e43b41SHong Zhang *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); \
186d4002b98SHong Zhang } \
18707e43b41SHong Zhang *(cp + a->sliceheight * _i) = col; \
18807e43b41SHong Zhang *(vp + a->sliceheight * _i) = value; \
1899371c9d4SSatish Balay a->nz++; \
1909371c9d4SSatish Balay a->rlen[row]++; \
1919371c9d4SSatish Balay A->nonzerostate++; \
1929371c9d4SSatish Balay low = _i + 1; \
1939371c9d4SSatish Balay high++; \
194d4002b98SHong Zhang } \
195a8f51744SPierre Jolivet } while (0)
196d4002b98SHong Zhang
197b7c0efcaSStefano Zampini #define PetscCeilIntMacro(x, y) ((((PetscInt)(x)) / ((PetscInt)(y))) + ((((PetscInt)(x)) % ((PetscInt)(y))) ? 1 : 0))
198b7c0efcaSStefano Zampini
199d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat, PetscInt, const PetscInt[]);
200d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMult_SeqSELL(Mat, Vec, Vec);
201d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultAdd_SeqSELL(Mat, Vec, Vec, Vec);
202d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTranspose_SeqSELL(Mat, Vec, Vec);
203d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat, Vec, Vec, Vec);
204d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatZeroEntries_SeqSELL(Mat);
205d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDestroy_SeqSELL(Mat);
206d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetOption_SeqSELL(Mat, MatOption, PetscBool);
207d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqSELL(Mat, Vec v);
208d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], PetscScalar[]);
209d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatView_SeqSELL(Mat, PetscViewer);
210d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SeqSELL(Mat, MatAssemblyType);
211d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetInfo_SeqSELL(Mat, MatInfoType, MatInfo *);
212d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], const PetscScalar[], InsertMode);
213d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatCopy_SeqSELL(Mat, Mat, MatStructure);
214d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetUp_SeqSELL(Mat);
215d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat, PetscScalar *[]);
216d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat, PetscScalar *[]);
217d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatShift_SeqSELL(Mat, PetscScalar);
218d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSOR_SeqSELL(Mat, Vec, PetscReal, MatSORType, PetscReal, PetscInt, PetscInt, Vec);
219d4002b98SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat);
220d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDuplicate_SeqSELL(Mat, MatDuplicateOption, Mat *);
221d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatEqual_SeqSELL(Mat, Mat, PetscBool *);
222d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat, MatType, MatReuse, Mat *);
223d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat, MatType, MatReuse, Mat *);
224d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringCreate_SeqSELL(Mat, ISColoring, MatFDColoring);
225d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringSetUp_SeqSELL(Mat, ISColoring, MatFDColoring);
226d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *);
227d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatRestoreColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *);
228d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConjugate_SeqSELL(Mat A);
229d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatScale_SeqSELL(Mat, PetscScalar);
230d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDiagonalScale_SeqSELL(Mat, Vec, Vec);
231