xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision d0e6bf2ad94dcc89b258ce16c7987200a4714786)
1938d9b04SRichard Tran Mills /*
2938d9b04SRichard Tran Mills   Defines basic operations for the MATSEQAIJPERM matrix class.
3938d9b04SRichard Tran Mills   This class is derived from the MATSEQAIJ class and retains the
4938d9b04SRichard Tran Mills   compressed row storage (aka Yale sparse matrix format) but augments
5938d9b04SRichard Tran Mills   it with some permutation information that enables some operations
6938d9b04SRichard Tran Mills   to be more vectorizable.  A physically rearranged copy of the matrix
7938d9b04SRichard Tran Mills   may be stored if the user desires.
8938d9b04SRichard Tran Mills 
9938d9b04SRichard Tran Mills   Eventually a variety of permutations may be supported.
10938d9b04SRichard Tran Mills */
11938d9b04SRichard Tran Mills 
12938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h>
13938d9b04SRichard Tran Mills 
1480423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
15f67b6f2eSRichard Tran Mills   #include <immintrin.h>
16f67b6f2eSRichard Tran Mills 
17f67b6f2eSRichard Tran Mills   #if !defined(_MM_SCALE_8)
18f67b6f2eSRichard Tran Mills     #define _MM_SCALE_8 8
19f67b6f2eSRichard Tran Mills   #endif
20f67b6f2eSRichard Tran Mills   #if !defined(_MM_SCALE_4)
21f67b6f2eSRichard Tran Mills     #define _MM_SCALE_4 4
22f67b6f2eSRichard Tran Mills   #endif
23f67b6f2eSRichard Tran Mills #endif
24f67b6f2eSRichard Tran Mills 
25938d9b04SRichard Tran Mills #define NDIM 512
26938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when
27938d9b04SRichard Tran Mills  * performing the vectorized mat-vec.  This depends on various factors
28938d9b04SRichard Tran Mills  * such as vector register length, etc., and I really need to add a
29938d9b04SRichard Tran Mills  * way for the user (or the library) to tune this.  I'm setting it to
30938d9b04SRichard Tran Mills  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
31938d9b04SRichard Tran Mills  * routines. */
32938d9b04SRichard Tran Mills 
33938d9b04SRichard Tran Mills typedef struct {
34938d9b04SRichard Tran Mills   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
35938d9b04SRichard Tran Mills 
36938d9b04SRichard Tran Mills   PetscInt  ngroup;
37938d9b04SRichard Tran Mills   PetscInt *xgroup;
38938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
39938d9b04SRichard Tran Mills    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
40938d9b04SRichard Tran Mills    * where the ith group begins. */
41938d9b04SRichard Tran Mills 
42938d9b04SRichard Tran Mills   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
43938d9b04SRichard Tran Mills   PetscInt *iperm;   /* The permutation vector. */
44938d9b04SRichard Tran Mills 
45938d9b04SRichard Tran Mills   /* Some of this stuff is for Ed's recursive triangular solve.
46938d9b04SRichard Tran Mills    * I'm not sure what I need yet. */
47938d9b04SRichard Tran Mills   PetscInt   blocksize;
48938d9b04SRichard Tran Mills   PetscInt   nstep;
49938d9b04SRichard Tran Mills   PetscInt  *jstart_list;
50938d9b04SRichard Tran Mills   PetscInt  *jend_list;
51938d9b04SRichard Tran Mills   PetscInt  *action_list;
52938d9b04SRichard Tran Mills   PetscInt  *ngroup_list;
53938d9b04SRichard Tran Mills   PetscInt **ipointer_list;
54938d9b04SRichard Tran Mills   PetscInt **xgroup_list;
55938d9b04SRichard Tran Mills   PetscInt **nzgroup_list;
56938d9b04SRichard Tran Mills   PetscInt **iperm_list;
57938d9b04SRichard Tran Mills } Mat_SeqAIJPERM;
58938d9b04SRichard Tran Mills 
MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat * newmat)59d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
60d71ae5a4SJacob Faibussowitsch {
61938d9b04SRichard Tran Mills   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
62938d9b04SRichard Tran Mills   /* so we will ignore 'MatType type'. */
63938d9b04SRichard Tran Mills   Mat             B       = *newmat;
64938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
65938d9b04SRichard Tran Mills 
66938d9b04SRichard Tran Mills   PetscFunctionBegin;
67938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
689566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
69938d9b04SRichard Tran Mills     aijperm = (Mat_SeqAIJPERM *)B->spptr;
70938d9b04SRichard Tran Mills   }
71938d9b04SRichard Tran Mills 
72938d9b04SRichard Tran Mills   /* Reset the original function pointers. */
73938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJ;
75938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJ;
76938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJ;
77938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJ;
78938d9b04SRichard Tran Mills 
799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
80938d9b04SRichard Tran Mills 
81938d9b04SRichard Tran Mills   /* Free everything in the Mat_SeqAIJPERM data structure.*/
829566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
839566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
849566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
859566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->spptr));
86938d9b04SRichard Tran Mills 
87938d9b04SRichard Tran Mills   /* Change the type of B to MATSEQAIJ. */
889566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
89938d9b04SRichard Tran Mills 
90938d9b04SRichard Tran Mills   *newmat = B;
913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
92938d9b04SRichard Tran Mills }
93938d9b04SRichard Tran Mills 
MatDestroy_SeqAIJPERM(Mat A)9466976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
95d71ae5a4SJacob Faibussowitsch {
96938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
97938d9b04SRichard Tran Mills 
98938d9b04SRichard Tran Mills   PetscFunctionBegin;
99938d9b04SRichard Tran Mills   if (aijperm) {
100938d9b04SRichard Tran Mills     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
1019566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->xgroup));
1029566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->nzgroup));
1039566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->iperm));
1049566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->spptr));
105938d9b04SRichard Tran Mills   }
106938d9b04SRichard Tran Mills   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
107938d9b04SRichard Tran Mills    * to destroy everything that remains. */
1089566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
109938d9b04SRichard Tran Mills   /* Note that I don't call MatSetType().  I believe this is because that
110938d9b04SRichard Tran Mills    * is only to be called when *building* a matrix.  I could be wrong, but
111938d9b04SRichard Tran Mills    * that is how things work for the SuperLU matrix class. */
1122e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
1139566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
1143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
115938d9b04SRichard Tran Mills }
116938d9b04SRichard Tran Mills 
MatDuplicate_SeqAIJPERM(Mat A,MatDuplicateOption op,Mat * M)11766976f2fSJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
118d71ae5a4SJacob Faibussowitsch {
119938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
120938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm_dest;
121938d9b04SRichard Tran Mills   PetscBool       perm;
122938d9b04SRichard Tran Mills 
123938d9b04SRichard Tran Mills   PetscFunctionBegin;
1249566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
1259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
126938d9b04SRichard Tran Mills   if (perm) {
127938d9b04SRichard Tran Mills     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
1289566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->xgroup));
1299566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->nzgroup));
1309566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->iperm));
131938d9b04SRichard Tran Mills   } else {
1324dfa11a4SJacob Faibussowitsch     PetscCall(PetscNew(&aijperm_dest));
133938d9b04SRichard Tran Mills     (*M)->spptr = (void *)aijperm_dest;
1349566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
1359566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
136938d9b04SRichard Tran Mills   }
1379566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
138938d9b04SRichard Tran Mills   /* Allocate space for, and copy the grouping and permutation info.
139938d9b04SRichard Tran Mills    * I note that when the groups are initially determined in
140938d9b04SRichard Tran Mills    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
141938d9b04SRichard Tran Mills    * necessary.  But at this point, we know how large they need to be, and
142938d9b04SRichard Tran Mills    * allocate only the necessary amount of memory.  So the duplicated matrix
143938d9b04SRichard Tran Mills    * may actually use slightly less storage than the original! */
1449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
1459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
1469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
1479566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
1489566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
1499566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
1503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
151938d9b04SRichard Tran Mills }
152938d9b04SRichard Tran Mills 
MatSeqAIJPERM_create_perm(Mat A)15366976f2fSJacob Faibussowitsch static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
154d71ae5a4SJacob Faibussowitsch {
155*57508eceSPierre Jolivet   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)A->data;
156938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
157938d9b04SRichard Tran Mills   PetscInt        m;     /* Number of rows in the matrix. */
158938d9b04SRichard Tran Mills   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
159938d9b04SRichard Tran Mills   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
160938d9b04SRichard Tran Mills   PetscInt       *rows_in_bucket;
161938d9b04SRichard Tran Mills   /* To construct the permutation, we sort each row into one of maxnz
162938d9b04SRichard Tran Mills    * buckets based on how many nonzeros are in the row. */
163938d9b04SRichard Tran Mills   PetscInt  nz;
164938d9b04SRichard Tran Mills   PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
165938d9b04SRichard Tran Mills   PetscInt *ipnz;
166938d9b04SRichard Tran Mills   /* When constructing the iperm permutation vector,
167938d9b04SRichard Tran Mills    * ipnz[nz] is used to point to the next place in the permutation vector
168938d9b04SRichard Tran Mills    * that a row with nz nonzero elements should be placed.*/
169938d9b04SRichard Tran Mills   PetscInt i, ngroup, istart, ipos;
170938d9b04SRichard Tran Mills 
171938d9b04SRichard Tran Mills   PetscFunctionBegin;
1723ba16761SJacob Faibussowitsch   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
173938d9b04SRichard Tran Mills   aijperm->nonzerostate = A->nonzerostate;
174938d9b04SRichard Tran Mills   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
1759566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
1769566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
1779566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
178938d9b04SRichard Tran Mills 
179938d9b04SRichard Tran Mills   m  = A->rmap->n;
180938d9b04SRichard Tran Mills   ia = a->i;
181938d9b04SRichard Tran Mills 
182938d9b04SRichard Tran Mills   /* Allocate the arrays that will hold the permutation vector. */
1839566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aijperm->iperm));
184938d9b04SRichard Tran Mills 
185938d9b04SRichard Tran Mills   /* Allocate some temporary work arrays that will be used in
1866aad120cSJose E. Roman    * calculating the permutation vector and groupings. */
1879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &nz_in_row));
188938d9b04SRichard Tran Mills 
189938d9b04SRichard Tran Mills   /* Now actually figure out the permutation and grouping. */
190938d9b04SRichard Tran Mills 
191938d9b04SRichard Tran Mills   /* First pass: Determine number of nonzeros in each row, maximum
192938d9b04SRichard Tran Mills    * number of nonzeros in any row, and how many rows fall into each
193938d9b04SRichard Tran Mills    * "bucket" of rows with same number of nonzeros. */
194938d9b04SRichard Tran Mills   maxnz = 0;
195938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
196938d9b04SRichard Tran Mills     nz_in_row[i] = ia[i + 1] - ia[i];
197938d9b04SRichard Tran Mills     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
198938d9b04SRichard Tran Mills   }
1999566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
2009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
201938d9b04SRichard Tran Mills 
202ad540459SPierre Jolivet   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
203938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
204938d9b04SRichard Tran Mills     nz = nz_in_row[i];
205938d9b04SRichard Tran Mills     rows_in_bucket[nz]++;
206938d9b04SRichard Tran Mills   }
207938d9b04SRichard Tran Mills 
208938d9b04SRichard Tran Mills   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
209938d9b04SRichard Tran Mills    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
210938d9b04SRichard Tran Mills    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
211938d9b04SRichard Tran Mills    * then xgroup[] must consist of (maxnz + 2) elements, since the last
212938d9b04SRichard Tran Mills    * element of xgroup will tell us where the (maxnz + 1)th group ends.
213938d9b04SRichard Tran Mills    * We allocate space for the maximum number of groups;
214938d9b04SRichard Tran Mills    * that is potentially a little wasteful, but not too much so.
215938d9b04SRichard Tran Mills    * Perhaps I should fix it later. */
2169566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
2179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
218938d9b04SRichard Tran Mills 
219938d9b04SRichard Tran Mills   /* Second pass.  Look at what is in the buckets and create the groupings.
220938d9b04SRichard Tran Mills    * Note that it is OK to have a group of rows with no non-zero values. */
221938d9b04SRichard Tran Mills   ngroup = 0;
222938d9b04SRichard Tran Mills   istart = 0;
223938d9b04SRichard Tran Mills   for (i = 0; i <= maxnz; i++) {
224938d9b04SRichard Tran Mills     if (rows_in_bucket[i] > 0) {
225938d9b04SRichard Tran Mills       aijperm->nzgroup[ngroup] = i;
226938d9b04SRichard Tran Mills       aijperm->xgroup[ngroup]  = istart;
227938d9b04SRichard Tran Mills       ngroup++;
228938d9b04SRichard Tran Mills       istart += rows_in_bucket[i];
229938d9b04SRichard Tran Mills     }
230938d9b04SRichard Tran Mills   }
231938d9b04SRichard Tran Mills 
232938d9b04SRichard Tran Mills   aijperm->xgroup[ngroup] = istart;
233938d9b04SRichard Tran Mills   aijperm->ngroup         = ngroup;
234938d9b04SRichard Tran Mills 
235938d9b04SRichard Tran Mills   /* Now fill in the permutation vector iperm. */
236938d9b04SRichard Tran Mills   ipnz[0] = 0;
237ad540459SPierre Jolivet   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
238938d9b04SRichard Tran Mills 
239938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
240938d9b04SRichard Tran Mills     nz                   = nz_in_row[i];
241938d9b04SRichard Tran Mills     ipos                 = ipnz[nz];
242938d9b04SRichard Tran Mills     aijperm->iperm[ipos] = i;
243938d9b04SRichard Tran Mills     ipnz[nz]++;
244938d9b04SRichard Tran Mills   }
245938d9b04SRichard Tran Mills 
246938d9b04SRichard Tran Mills   /* Clean up temporary work arrays. */
2479566063dSJacob Faibussowitsch   PetscCall(PetscFree(rows_in_bucket));
2489566063dSJacob Faibussowitsch   PetscCall(PetscFree(ipnz));
2499566063dSJacob Faibussowitsch   PetscCall(PetscFree(nz_in_row));
2503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
251938d9b04SRichard Tran Mills }
252938d9b04SRichard Tran Mills 
MatAssemblyEnd_SeqAIJPERM(Mat A,MatAssemblyType mode)25366976f2fSJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254d71ae5a4SJacob Faibussowitsch {
255938d9b04SRichard Tran Mills   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
256938d9b04SRichard Tran Mills 
257938d9b04SRichard Tran Mills   PetscFunctionBegin;
2583ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
259938d9b04SRichard Tran Mills 
260938d9b04SRichard Tran Mills   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
261938d9b04SRichard Tran Mills    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
262938d9b04SRichard Tran Mills    * I'm not sure if this is the best way to do this, but it avoids
263938d9b04SRichard Tran Mills    * a lot of code duplication.
264938d9b04SRichard Tran Mills    * I also note that currently MATSEQAIJPERM doesn't know anything about
265938d9b04SRichard Tran Mills    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
266938d9b04SRichard Tran Mills    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
267938d9b04SRichard Tran Mills    * this, this may break things.  (Don't know... haven't looked at it.) */
268938d9b04SRichard Tran Mills   a->inode.use = PETSC_FALSE;
2699566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
270938d9b04SRichard Tran Mills 
271938d9b04SRichard Tran Mills   /* Now calculate the permutation and grouping information. */
2729566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJPERM_create_perm(A));
2733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
274938d9b04SRichard Tran Mills }
275938d9b04SRichard Tran Mills 
MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)27666976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
277d71ae5a4SJacob Faibussowitsch {
278938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
279938d9b04SRichard Tran Mills   const PetscScalar *x;
280938d9b04SRichard Tran Mills   PetscScalar       *y;
281938d9b04SRichard Tran Mills   const MatScalar   *aa;
282938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
283938d9b04SRichard Tran Mills   PetscInt           i, j;
28480423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
285f67b6f2eSRichard Tran Mills   __m512d  vec_x, vec_y, vec_vals;
286f67b6f2eSRichard Tran Mills   __m256i  vec_idx, vec_ipos, vec_j;
287f67b6f2eSRichard Tran Mills   __mmask8 mask;
288f67b6f2eSRichard Tran Mills #endif
289938d9b04SRichard Tran Mills 
290938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMult_SeqAIJ. */
291938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
292938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
293938d9b04SRichard Tran Mills   PetscInt       *xgroup;
294938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
295938d9b04SRichard Tran Mills    * begin and end in iperm. */
296938d9b04SRichard Tran Mills   PetscInt *nzgroup;
297938d9b04SRichard Tran Mills   PetscInt  ngroup;
298938d9b04SRichard Tran Mills   PetscInt  igroup;
299938d9b04SRichard Tran Mills   PetscInt  jstart, jend;
300938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
301938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
302938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
303938d9b04SRichard Tran Mills   PetscInt    iold, nz;
304938d9b04SRichard Tran Mills   PetscInt    istart, iend, isize;
305938d9b04SRichard Tran Mills   PetscInt    ipos;
306938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
307938d9b04SRichard Tran Mills   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
308938d9b04SRichard Tran Mills 
309938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
310938d9b04SRichard Tran Mills   #pragma disjoint(*x, *y, *aa)
311938d9b04SRichard Tran Mills #endif
312938d9b04SRichard Tran Mills 
313938d9b04SRichard Tran Mills   PetscFunctionBegin;
3149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3159566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
316938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
317938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
318938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
319938d9b04SRichard Tran Mills 
320938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
321938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
322938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
323938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
324938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
325938d9b04SRichard Tran Mills 
326938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
327938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
328938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
329938d9b04SRichard Tran Mills     nz     = nzgroup[igroup];
330938d9b04SRichard Tran Mills 
331938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
332938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
333938d9b04SRichard Tran Mills     if (nz == 0) {
334ad540459SPierre Jolivet       for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
335938d9b04SRichard Tran Mills     } else if (nz == 1) {
336938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
337938d9b04SRichard Tran Mills         iold    = iperm[i];
338938d9b04SRichard Tran Mills         ipos    = ai[iold];
339938d9b04SRichard Tran Mills         y[iold] = aa[ipos] * x[aj[ipos]];
340938d9b04SRichard Tran Mills       }
341938d9b04SRichard Tran Mills     } else {
342938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
343938d9b04SRichard Tran Mills        * at a time. */
344938d9b04SRichard Tran Mills 
345938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
346938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
347938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
348938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
349938d9b04SRichard Tran Mills 
350938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
351938d9b04SRichard Tran Mills 
352938d9b04SRichard Tran Mills         isize = iend - istart + 1;
353938d9b04SRichard Tran Mills 
354938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
355938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
356938d9b04SRichard Tran Mills          * row of the chunk will begin. */
357938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
358938d9b04SRichard Tran Mills           iold = iperm[istart + i];
359938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
360938d9b04SRichard Tran Mills           ip[i] = ai[iold];
361938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
362938d9b04SRichard Tran Mills           yp[i] = (PetscScalar)0.0;
363938d9b04SRichard Tran Mills         }
364938d9b04SRichard Tran Mills 
365938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
366938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
367938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
368938d9b04SRichard Tran Mills         if (nz > isize) {
369938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
370938d9b04SRichard Tran Mills   #pragma _CRI preferstream
371938d9b04SRichard Tran Mills #endif
372938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
373938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
374938d9b04SRichard Tran Mills   #pragma _CRI prefervector
375938d9b04SRichard Tran Mills #endif
376f67b6f2eSRichard Tran Mills 
37780423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
378f67b6f2eSRichard Tran Mills             vec_y = _mm512_setzero_pd();
379f67b6f2eSRichard Tran Mills             ipos  = ip[i];
380f67b6f2eSRichard Tran Mills             for (j = 0; j < (nz >> 3); j++) {
381f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
382f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
383f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
384f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
385f67b6f2eSRichard Tran Mills               ipos += 8;
386f67b6f2eSRichard Tran Mills             }
387f67b6f2eSRichard Tran Mills             if ((nz & 0x07) > 2) {
388f67b6f2eSRichard Tran Mills               mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
389f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
390f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
391f67b6f2eSRichard Tran Mills               vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
392f67b6f2eSRichard Tran Mills               vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
393f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 2) {
394f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
395f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
396f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 1) {
397f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
398f67b6f2eSRichard Tran Mills             }
399f67b6f2eSRichard Tran Mills             yp[i] += _mm512_reduce_add_pd(vec_y);
400f67b6f2eSRichard Tran Mills #else
401938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
402938d9b04SRichard Tran Mills               ipos = ip[i] + j;
403938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
404938d9b04SRichard Tran Mills             }
405f67b6f2eSRichard Tran Mills #endif
406938d9b04SRichard Tran Mills           }
407938d9b04SRichard Tran Mills         } else {
408938d9b04SRichard Tran Mills           /* Otherwise, there are enough rows in the chunk to make it
409938d9b04SRichard Tran Mills            * worthwhile to vectorize across the rows, that is, to do the
410938d9b04SRichard Tran Mills            * matvec by operating with "columns" of the chunk. */
411938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
41280423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
413f67b6f2eSRichard Tran Mills             vec_j = _mm256_set1_epi32(j);
414f67b6f2eSRichard Tran Mills             for (i = 0; i < ((isize >> 3) << 3); i += 8) {
415f67b6f2eSRichard Tran Mills               vec_y    = _mm512_loadu_pd(&yp[i]);
416f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
417f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
418f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
419f67b6f2eSRichard Tran Mills               vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
420f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
421f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
422f67b6f2eSRichard Tran Mills               _mm512_storeu_pd(&yp[i], vec_y);
423f67b6f2eSRichard Tran Mills             }
424f67b6f2eSRichard Tran Mills             for (i = isize - (isize & 0x07); i < isize; i++) {
425f67b6f2eSRichard Tran Mills               ipos = ip[i] + j;
426f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
427f67b6f2eSRichard Tran Mills             }
428f67b6f2eSRichard Tran Mills #else
429938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
430938d9b04SRichard Tran Mills               ipos = ip[i] + j;
431938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
432938d9b04SRichard Tran Mills             }
433f67b6f2eSRichard Tran Mills #endif
434938d9b04SRichard Tran Mills           }
435938d9b04SRichard Tran Mills         }
436938d9b04SRichard Tran Mills 
437938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
438938d9b04SRichard Tran Mills   #pragma _CRI ivdep
439938d9b04SRichard Tran Mills #endif
440938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
441ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
442938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
443938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
444938d9b04SRichard Tran Mills   } /* End loop over igroup. */
4459566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
4469566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4479566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
4483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
449938d9b04SRichard Tran Mills }
450938d9b04SRichard Tran Mills 
451938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
452938d9b04SRichard Tran Mills  * Note that the names I used to designate the vectors differs from that
453938d9b04SRichard Tran Mills  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
454938d9b04SRichard Tran Mills  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
455938d9b04SRichard Tran Mills /*
456938d9b04SRichard Tran Mills     I hate having virtually identical code for the mult and the multadd!!!
457938d9b04SRichard Tran Mills */
MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)45866976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
459d71ae5a4SJacob Faibussowitsch {
460938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
461938d9b04SRichard Tran Mills   const PetscScalar *x;
462938d9b04SRichard Tran Mills   PetscScalar       *y, *w;
463938d9b04SRichard Tran Mills   const MatScalar   *aa;
464938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
465938d9b04SRichard Tran Mills   PetscInt           i, j;
466938d9b04SRichard Tran Mills 
467938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
468938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
469938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
470938d9b04SRichard Tran Mills   PetscInt       *xgroup;
471938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
472938d9b04SRichard Tran Mills    * begin and end in iperm. */
473938d9b04SRichard Tran Mills   PetscInt *nzgroup;
474938d9b04SRichard Tran Mills   PetscInt  ngroup;
475938d9b04SRichard Tran Mills   PetscInt  igroup;
476938d9b04SRichard Tran Mills   PetscInt  jstart, jend;
477938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
478938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
479938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
480938d9b04SRichard Tran Mills   PetscInt    iold, nz;
481938d9b04SRichard Tran Mills   PetscInt    istart, iend, isize;
482938d9b04SRichard Tran Mills   PetscInt    ipos;
483938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
484938d9b04SRichard Tran Mills   PetscInt    ip[NDIM];
485938d9b04SRichard Tran Mills   /* yp[] and ip[] are treated as vector "registers" for performing
486938d9b04SRichard Tran Mills    * the mat-vec. */
487938d9b04SRichard Tran Mills 
488938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
489938d9b04SRichard Tran Mills   #pragma disjoint(*x, *y, *aa)
490938d9b04SRichard Tran Mills #endif
491938d9b04SRichard Tran Mills 
492938d9b04SRichard Tran Mills   PetscFunctionBegin;
4939566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, ww, &y, &w));
495938d9b04SRichard Tran Mills 
496938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
497938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
498938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
499938d9b04SRichard Tran Mills 
500938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
501938d9b04SRichard Tran Mills   aijperm = (Mat_SeqAIJPERM *)A->spptr;
502938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
503938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
504938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
505938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
506938d9b04SRichard Tran Mills 
507938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
508938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
509938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
510938d9b04SRichard Tran Mills 
511938d9b04SRichard Tran Mills     nz = nzgroup[igroup];
512938d9b04SRichard Tran Mills 
513938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
514938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
515938d9b04SRichard Tran Mills     if (nz == 0) {
516938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
517938d9b04SRichard Tran Mills         iold    = iperm[i];
518938d9b04SRichard Tran Mills         y[iold] = w[iold];
519938d9b04SRichard Tran Mills       }
5209371c9d4SSatish Balay     } else if (nz == 1) {
521938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
522938d9b04SRichard Tran Mills         iold    = iperm[i];
523938d9b04SRichard Tran Mills         ipos    = ai[iold];
524938d9b04SRichard Tran Mills         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
525938d9b04SRichard Tran Mills       }
526938d9b04SRichard Tran Mills     }
527938d9b04SRichard Tran Mills     /* For the general case: */
528938d9b04SRichard Tran Mills     else {
529938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
530938d9b04SRichard Tran Mills        * at a time. */
531938d9b04SRichard Tran Mills 
532938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
533938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
534938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
535938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
536938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
537938d9b04SRichard Tran Mills         isize = iend - istart + 1;
538938d9b04SRichard Tran Mills 
539938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
540938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
541938d9b04SRichard Tran Mills          * row of the chunk will begin. */
542938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
543938d9b04SRichard Tran Mills           iold = iperm[istart + i];
544938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
545938d9b04SRichard Tran Mills           ip[i] = ai[iold];
546938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
547938d9b04SRichard Tran Mills           yp[i] = w[iold];
548938d9b04SRichard Tran Mills         }
549938d9b04SRichard Tran Mills 
550938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
551938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
552938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
553938d9b04SRichard Tran Mills         if (nz > isize) {
554938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
555938d9b04SRichard Tran Mills   #pragma _CRI preferstream
556938d9b04SRichard Tran Mills #endif
557938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
558938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
559938d9b04SRichard Tran Mills   #pragma _CRI prefervector
560938d9b04SRichard Tran Mills #endif
561938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
562938d9b04SRichard Tran Mills               ipos = ip[i] + j;
563938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
564938d9b04SRichard Tran Mills             }
565938d9b04SRichard Tran Mills           }
566938d9b04SRichard Tran Mills         }
567938d9b04SRichard Tran Mills         /* Otherwise, there are enough rows in the chunk to make it
568938d9b04SRichard Tran Mills          * worthwhile to vectorize across the rows, that is, to do the
569938d9b04SRichard Tran Mills          * matvec by operating with "columns" of the chunk. */
570938d9b04SRichard Tran Mills         else {
571938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
572938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
573938d9b04SRichard Tran Mills               ipos = ip[i] + j;
574938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
575938d9b04SRichard Tran Mills             }
576938d9b04SRichard Tran Mills           }
577938d9b04SRichard Tran Mills         }
578938d9b04SRichard Tran Mills 
579938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
580938d9b04SRichard Tran Mills   #pragma _CRI ivdep
581938d9b04SRichard Tran Mills #endif
582938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
583ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
584938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
585938d9b04SRichard Tran Mills 
586938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
587938d9b04SRichard Tran Mills   } /* End loop over igroup. */
588938d9b04SRichard Tran Mills 
5899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
5909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5919566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
5923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
593938d9b04SRichard Tran Mills }
594938d9b04SRichard Tran Mills 
595938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
596938d9b04SRichard Tran Mills  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
597938d9b04SRichard Tran Mills  * routine, but can also be used to convert an assembled SeqAIJ matrix
598938d9b04SRichard Tran Mills  * into a SeqAIJPERM one. */
MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat * newmat)599d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
600d71ae5a4SJacob Faibussowitsch {
601938d9b04SRichard Tran Mills   Mat             B = *newmat;
602938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
603938d9b04SRichard Tran Mills   PetscBool       sametype;
604938d9b04SRichard Tran Mills 
605938d9b04SRichard Tran Mills   PetscFunctionBegin;
60648a46eb9SPierre Jolivet   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
6079566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
6083ba16761SJacob Faibussowitsch   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
609938d9b04SRichard Tran Mills 
6104dfa11a4SJacob Faibussowitsch   PetscCall(PetscNew(&aijperm));
611938d9b04SRichard Tran Mills   B->spptr = (void *)aijperm;
612938d9b04SRichard Tran Mills 
613938d9b04SRichard Tran Mills   /* Set function pointers for methods that we inherit from AIJ but override. */
614938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
615938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
616938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJPERM;
617938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJPERM;
618938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
619938d9b04SRichard Tran Mills 
620938d9b04SRichard Tran Mills   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
621938d9b04SRichard Tran Mills   /* If A has already been assembled, compute the permutation. */
6221baa6e33SBarry Smith   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
623938d9b04SRichard Tran Mills 
6249566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
625938d9b04SRichard Tran Mills 
6269566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
627938d9b04SRichard Tran Mills   *newmat = B;
6283ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
629938d9b04SRichard Tran Mills }
630938d9b04SRichard Tran Mills 
631938d9b04SRichard Tran Mills /*@C
63211a5261eSBarry Smith   MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
633938d9b04SRichard Tran Mills 
634d083f849SBarry Smith   Collective
635938d9b04SRichard Tran Mills 
636938d9b04SRichard Tran Mills   Input Parameters:
63711a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
638938d9b04SRichard Tran Mills . m    - number of rows
639938d9b04SRichard Tran Mills . n    - number of columns
64020f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is given
64120f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
642938d9b04SRichard Tran Mills 
643938d9b04SRichard Tran Mills   Output Parameter:
644938d9b04SRichard Tran Mills . A - the matrix
645938d9b04SRichard Tran Mills 
646938d9b04SRichard Tran Mills   Level: intermediate
647938d9b04SRichard Tran Mills 
6482920cce0SJacob Faibussowitsch   Notes:
6492920cce0SJacob Faibussowitsch   This type inherits from `MATSEQAIJ`, but calculates some additional permutation information
6502920cce0SJacob Faibussowitsch   that is used to allow better vectorization of some operations.  At the cost of increased
6512920cce0SJacob Faibussowitsch   storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the
6522920cce0SJacob Faibussowitsch   matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use
6532920cce0SJacob Faibussowitsch   stride-1 memory accesses.
6542920cce0SJacob Faibussowitsch 
6551cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
656938d9b04SRichard Tran Mills @*/
MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)657d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
658d71ae5a4SJacob Faibussowitsch {
659938d9b04SRichard Tran Mills   PetscFunctionBegin;
6609566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
6619566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
6629566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJPERM));
6639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
6643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
665938d9b04SRichard Tran Mills }
666938d9b04SRichard Tran Mills 
MatCreate_SeqAIJPERM(Mat A)667d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
668d71ae5a4SJacob Faibussowitsch {
669938d9b04SRichard Tran Mills   PetscFunctionBegin;
6709566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJ));
6719566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
6723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
673938d9b04SRichard Tran Mills }
674