xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16   #include <immintrin.h>
17 
18   #if !defined(_MM_SCALE_8)
19     #define _MM_SCALE_8 8
20   #endif
21   #if !defined(_MM_SCALE_4)
22     #define _MM_SCALE_4 4
23   #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
/* Per-matrix context attached to A->spptr for MATSEQAIJPERM.  It records a
 * permutation of the rows that sorts them into groups of equal nonzero count,
 * which lets the mat-vec kernels vectorize across rows within a group. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt  ngroup; /* number of groups of rows sharing the same nonzero count */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins. */

  PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet.
   * NOTE(review): none of the fields below are allocated or read anywhere in
   * this file; they appear to be reserved for future work. */
  PetscInt   blocksize;
  PetscInt   nstep;
  PetscInt  *jstart_list;
  PetscInt  *jend_list;
  PetscInt  *action_list;
  PetscInt  *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   Mat             B       = *newmat;
65   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
66 
67   PetscFunctionBegin;
68   if (reuse == MAT_INITIAL_MATRIX) {
69     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
70     aijperm = (Mat_SeqAIJPERM *)B->spptr;
71   }
72 
73   /* Reset the original function pointers. */
74   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75   B->ops->destroy     = MatDestroy_SeqAIJ;
76   B->ops->duplicate   = MatDuplicate_SeqAIJ;
77   B->ops->mult        = MatMult_SeqAIJ;
78   B->ops->multadd     = MatMultAdd_SeqAIJ;
79 
80   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
81 
82   /* Free everything in the Mat_SeqAIJPERM data structure.*/
83   PetscCall(PetscFree(aijperm->xgroup));
84   PetscCall(PetscFree(aijperm->nzgroup));
85   PetscCall(PetscFree(aijperm->iperm));
86   PetscCall(PetscFree(B->spptr));
87 
88   /* Change the type of B to MATSEQAIJ. */
89   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90 
91   *newmat = B;
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96 {
97   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
98 
99   PetscFunctionBegin;
100   if (aijperm) {
101     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
102     PetscCall(PetscFree(aijperm->xgroup));
103     PetscCall(PetscFree(aijperm->nzgroup));
104     PetscCall(PetscFree(aijperm->iperm));
105     PetscCall(PetscFree(A->spptr));
106   }
107   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108    * to destroy everything that remains. */
109   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110   /* Note that I don't call MatSetType().  I believe this is because that
111    * is only to be called when *building* a matrix.  I could be wrong, but
112    * that is how things work for the SuperLU matrix class. */
113   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
114   PetscCall(MatDestroy_SeqAIJ(A));
115   PetscFunctionReturn(PETSC_SUCCESS);
116 }
117 
/* Duplicate a MATSEQAIJPERM matrix, deep-copying the permutation/grouping
 * arrays so the new matrix owns its own copies. */
PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool       perm;

  PetscFunctionBegin;
  /* Let SeqAIJ do the heavy lifting; *M comes back as a SeqAIJ-compatible
   * duplicate whose type name may or may not already be MATSEQAIJPERM. */
  PetscCall(MatDuplicate_SeqAIJ(A, op, M));
  PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
  if (perm) {
    /* *M already carries a Mat_SeqAIJPERM context; drop its arrays so the
     * shallow struct copy below cannot leak them. */
    aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
    PetscCall(PetscFree(aijperm_dest->xgroup));
    PetscCall(PetscFree(aijperm_dest->nzgroup));
    PetscCall(PetscFree(aijperm_dest->iperm));
  } else {
    /* *M is a plain SeqAIJ: attach a fresh context and register the
     * conversion function, mirroring MatConvert_SeqAIJ_SeqAIJPERM(). */
    PetscCall(PetscNew(&aijperm_dest));
    (*M)->spptr = (void *)aijperm_dest;
    PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
    PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
  }
  /* Shallow-copy the whole context struct (scalars such as nonzerostate and
   * ngroup).  The three array pointers copied here are dangling aliases of
   * A's arrays only momentarily: they are replaced with fresh allocations
   * below.  NOTE(review): the unused *_list members are copied verbatim too;
   * nothing in this file allocates them, but confirm no aliasing issue if
   * they ever become live. */
  PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary.  But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory.  So the duplicated matrix
   * may actually use slightly less storage than the original! */
  PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
  PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
  PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
  PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
  PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
  PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
  PetscFunctionReturn(PETSC_SUCCESS);
}
153 
/* Build (or rebuild) the row permutation and grouping for A.
 * Rows are bucket-sorted by their nonzero count; iperm lists row indices so
 * that all rows with the same count are contiguous, and xgroup/nzgroup
 * describe where each group starts and how many nonzeros its rows have.
 * A no-op if A's nonzero structure is unchanged since the last call. */
PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
{
  Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt        m;     /* Number of rows in the matrix. */
  PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
  PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt       *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt  nz;
  PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  /* NOTE(review): only indices 0..maxnz of these arrays are ever touched,
   * so sizing them PetscMax(maxnz, m) + 1 over-allocates whenever m > maxnz;
   * presumably intentional or historical — confirm before shrinking. */
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

  /* Count how many rows land in each nonzero-count bucket. */
  for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
   * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

  /* Second pass.  Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm.  ipnz[] is the exclusive
   * prefix sum of the bucket counts: the next free slot for a row with a
   * given nonzero count. */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

  for (i = 0; i < m; i++) {
    nz                   = nz_in_row[i];
    ipos                 = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(PETSC_SUCCESS);
}
253 
254 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
255 {
256   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
257 
258   PetscFunctionBegin;
259   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
260 
261   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
262    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
263    * I'm not sure if this is the best way to do this, but it avoids
264    * a lot of code duplication.
265    * I also note that currently MATSEQAIJPERM doesn't know anything about
266    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
267    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
268    * this, this may break things.  (Don't know... haven't looked at it.) */
269   a->inode.use = PETSC_FALSE;
270   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
271 
272   /* Now calculate the permutation and grouping information. */
273   PetscCall(MatSeqAIJPERM_create_perm(A));
274   PetscFunctionReturn(PETSC_SUCCESS);
275 }
276 
/* Computes yy = A * xx using the row permutation/grouping stored in
 * A->spptr.  Rows are processed group by group (all rows in a group have the
 * same nonzero count nz), in chunks of at most NDIM rows, choosing between
 * row-wise (CSR-style) and column-wise (ELLPACK-style) vectorization based
 * on whether nz exceeds the chunk size. */
PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i, j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
  #pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  fortranmultaijperm_(&m, x, ii, aj, aa, y);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI preferstream
  #endif
          for (i = 0; i < isize; i++) {
  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI prefervector
  #endif

  #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Dot product of row i with x, 8 doubles per iteration:
             * load 8 column indices (32-bit) and 8 values, gather the
             * corresponding x entries, and accumulate with FMA. */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Tail of 3..7 elements: masked gather + masked FMA.  Tails of
             * 1 or 2 elements are cheaper done as scalar products.
             * (Unmasked lanes of vec_x are never read by the masked FMA.) */
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            /* Fold the 8 partial sums into the scalar accumulator. */
            yp[i] += _mm512_reduce_add_pd(vec_y);
  #else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
  #endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
  #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process 8 rows of the chunk at once: for each row, fetch the
             * jth element (at aa[ip[i] + j]) via gathers and accumulate. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            /* Scalar remainder: the last (isize mod 8) rows. */
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
  #else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
  #endif
          }
        }

  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI ivdep
  #endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    }   /* End handling matvec for chunk with nz > 1. */
  }     /* End loop over igroup. */
#endif
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
458 
459 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
460  * Note that the names I used to designate the vectors differs from that
461  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
462  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
463 /*
464     I hate having virtually identical code for the mult and the multadd!!!
465 */
/* Computes yy = ww + A * xx using the same permutation/grouping strategy as
 * MatMult_SeqAIJPERM(); the only differences are the initialization of the
 * accumulators from w and the flop count.  No AVX-512 path exists here. */
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i, j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
  #pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      /* Empty row: the result is just the corresponding entry of w. */
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin.  Unlike MatMult_SeqAIJPERM, the
         * accumulator starts from w[iold] rather than zero. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI preferstream
  #endif
          for (i = 0; i < isize; i++) {
  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI prefervector
  #endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

  #if defined(PETSC_HAVE_CRAY_VECTOR)
    #pragma _CRI ivdep
  #endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  }   /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(PETSC_SUCCESS);
}
609 
610 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
611  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
612  * routine, but can also be used to convert an assembled SeqAIJ matrix
613  * into a SeqAIJPERM one. */
614 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
615 {
616   Mat             B = *newmat;
617   Mat_SeqAIJPERM *aijperm;
618   PetscBool       sametype;
619 
620   PetscFunctionBegin;
621   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
622   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
623   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
624 
625   PetscCall(PetscNew(&aijperm));
626   B->spptr = (void *)aijperm;
627 
628   /* Set function pointers for methods that we inherit from AIJ but override. */
629   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
630   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
631   B->ops->destroy     = MatDestroy_SeqAIJPERM;
632   B->ops->mult        = MatMult_SeqAIJPERM;
633   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
634 
635   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
636   /* If A has already been assembled, compute the permutation. */
637   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
638 
639   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
640 
641   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
642   *newmat = B;
643   PetscFunctionReturn(PETSC_SUCCESS);
644 }
645 
646 /*@C
647    MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
648    This type inherits from `MATSEQAIJ`, but calculates some additional permutation
649    information that is used to allow better vectorization of some
650    operations.  At the cost of increased storage, the `MATSEQAIJ` formatted
651    matrix can be copied to a format in which pieces of the matrix are
652    stored in ELLPACK format, allowing the vectorized matrix multiply
653    routine to use stride-1 memory accesses.
654 
655    Collective
656 
657    Input Parameters:
658 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
659 .  m - number of rows
660 .  n - number of columns
661 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given
662 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
663 
664    Output Parameter:
665 .  A - the matrix
666 
667    Level: intermediate
668 
669 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
670 @*/
671 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
672 {
673   PetscFunctionBegin;
674   PetscCall(MatCreate(comm, A));
675   PetscCall(MatSetSizes(*A, m, n, m, n));
676   PetscCall(MatSetType(*A, MATSEQAIJPERM));
677   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
678   PetscFunctionReturn(PETSC_SUCCESS);
679 }
680 
/* Type-registration constructor for MATSEQAIJPERM: build the matrix as a
 * plain SeqAIJ first, then convert it in place so the overridden function
 * pointers and the permutation context get installed. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSetType(A, MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
688