xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
/* Private data appended to a MATSEQAIJ matrix (via A->spptr) to make it a
 * MATSEQAIJPERM: rows are grouped by their nonzero count so the mat-vec
 * kernels can vectorize across rows within a group. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt         ngroup;       /* number of groups of rows sharing a common nonzero count */
  PetscInt         *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins.  Has ngroup+1 entries; the last one marks
   * the end of the final group. */

  PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
  PetscInt         *iperm;  /* The permutation vector: iperm[k] is the original row number of the kth permuted row. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet.
   * NOTE(review): none of the fields below are used anywhere in this file. */
  PetscInt         blocksize;
  PetscInt         nstep;
  PetscInt         *jstart_list;
  PetscInt         *jend_list;
  PetscInt         *action_list;
  PetscInt         *ngroup_list;
  PetscInt         **ipointer_list;
  PetscInt         **xgroup_list;
  PetscInt         **nzgroup_list;
  PetscInt         **iperm_list;
} Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   Mat            B       = *newmat;
65   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
66 
67   PetscFunctionBegin;
68   if (reuse == MAT_INITIAL_MATRIX) {
69     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
70     aijperm=(Mat_SeqAIJPERM*)B->spptr;
71   }
72 
73   /* Reset the original function pointers. */
74   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75   B->ops->destroy     = MatDestroy_SeqAIJ;
76   B->ops->duplicate   = MatDuplicate_SeqAIJ;
77   B->ops->mult        = MatMult_SeqAIJ;
78   B->ops->multadd     = MatMultAdd_SeqAIJ;
79 
80   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL));
81 
82   /* Free everything in the Mat_SeqAIJPERM data structure.*/
83   PetscCall(PetscFree(aijperm->xgroup));
84   PetscCall(PetscFree(aijperm->nzgroup));
85   PetscCall(PetscFree(aijperm->iperm));
86   PetscCall(PetscFree(B->spptr));
87 
88   /* Change the type of B to MATSEQAIJ. */
89   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90 
91   *newmat = B;
92   PetscFunctionReturn(0);
93 }
94 
95 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96 {
97   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
98 
99   PetscFunctionBegin;
100   if (aijperm) {
101     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
102     PetscCall(PetscFree(aijperm->xgroup));
103     PetscCall(PetscFree(aijperm->nzgroup));
104     PetscCall(PetscFree(aijperm->iperm));
105     PetscCall(PetscFree(A->spptr));
106   }
107   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108    * to destroy everything that remains. */
109   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110   /* Note that I don't call MatSetType().  I believe this is because that
111    * is only to be called when *building* a matrix.  I could be wrong, but
112    * that is how things work for the SuperLU matrix class. */
113   PetscCall(MatDestroy_SeqAIJ(A));
114   PetscFunctionReturn(0);
115 }
116 
117 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
118 {
119   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
120   Mat_SeqAIJPERM *aijperm_dest;
121   PetscBool      perm;
122 
123   PetscFunctionBegin;
124   PetscCall(MatDuplicate_SeqAIJ(A,op,M));
125   PetscCall(PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm));
126   if (perm) {
127     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
128     PetscCall(PetscFree(aijperm_dest->xgroup));
129     PetscCall(PetscFree(aijperm_dest->nzgroup));
130     PetscCall(PetscFree(aijperm_dest->iperm));
131   } else {
132     PetscCall(PetscNewLog(*M,&aijperm_dest));
133     (*M)->spptr = (void*) aijperm_dest;
134     PetscCall(PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM));
135     PetscCall(PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
136   }
137   PetscCall(PetscArraycpy(aijperm_dest,aijperm,1));
138   /* Allocate space for, and copy the grouping and permutation info.
139    * I note that when the groups are initially determined in
140    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
141    * necessary.  But at this point, we know how large they need to be, and
142    * allocate only the necessary amount of memory.  So the duplicated matrix
143    * may actually use slightly less storage than the original! */
144   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
145   PetscCall(PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup));
146   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
147   PetscCall(PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n));
148   PetscCall(PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1));
149   PetscCall(PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup));
150   PetscFunctionReturn(0);
151 }
152 
153 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
154 {
155   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
156   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
157   PetscInt       m;       /* Number of rows in the matrix. */
158   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
159   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
160   PetscInt       *rows_in_bucket;
161   /* To construct the permutation, we sort each row into one of maxnz
162    * buckets based on how many nonzeros are in the row. */
163   PetscInt       nz;
164   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
165   PetscInt       *ipnz;
166   /* When constructing the iperm permutation vector,
167    * ipnz[nz] is used to point to the next place in the permutation vector
168    * that a row with nz nonzero elements should be placed.*/
169   PetscInt       i, ngroup, istart, ipos;
170 
171   PetscFunctionBegin;
172   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
173   aijperm->nonzerostate = A->nonzerostate;
174  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
175   PetscCall(PetscFree(aijperm->xgroup));
176   PetscCall(PetscFree(aijperm->nzgroup));
177   PetscCall(PetscFree(aijperm->iperm));
178 
179   m  = A->rmap->n;
180   ia = a->i;
181 
182   /* Allocate the arrays that will hold the permutation vector. */
183   PetscCall(PetscMalloc1(m, &aijperm->iperm));
184 
185   /* Allocate some temporary work arrays that will be used in
186    * calculating the permutation vector and groupings. */
187   PetscCall(PetscMalloc1(m, &nz_in_row));
188 
189   /* Now actually figure out the permutation and grouping. */
190 
191   /* First pass: Determine number of nonzeros in each row, maximum
192    * number of nonzeros in any row, and how many rows fall into each
193    * "bucket" of rows with same number of nonzeros. */
194   maxnz = 0;
195   for (i=0; i<m; i++) {
196     nz_in_row[i] = ia[i+1]-ia[i];
197     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
198   }
199   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket));
200   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz));
201 
202   for (i=0; i<=maxnz; i++) {
203     rows_in_bucket[i] = 0;
204   }
205   for (i=0; i<m; i++) {
206     nz = nz_in_row[i];
207     rows_in_bucket[nz]++;
208   }
209 
210   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
211    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
212    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
213    * then xgroup[] must consist of (maxnz + 2) elements, since the last
214    * element of xgroup will tell us where the (maxnz + 1)th group ends.
215    * We allocate space for the maximum number of groups;
216    * that is potentially a little wasteful, but not too much so.
217    * Perhaps I should fix it later. */
218   PetscCall(PetscMalloc1(maxnz+2, &aijperm->xgroup));
219   PetscCall(PetscMalloc1(maxnz+1, &aijperm->nzgroup));
220 
221   /* Second pass.  Look at what is in the buckets and create the groupings.
222    * Note that it is OK to have a group of rows with no non-zero values. */
223   ngroup = 0;
224   istart = 0;
225   for (i=0; i<=maxnz; i++) {
226     if (rows_in_bucket[i] > 0) {
227       aijperm->nzgroup[ngroup] = i;
228       aijperm->xgroup[ngroup]  = istart;
229       ngroup++;
230       istart += rows_in_bucket[i];
231     }
232   }
233 
234   aijperm->xgroup[ngroup] = istart;
235   aijperm->ngroup         = ngroup;
236 
237   /* Now fill in the permutation vector iperm. */
238   ipnz[0] = 0;
239   for (i=0; i<maxnz; i++) {
240     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
241   }
242 
243   for (i=0; i<m; i++) {
244     nz                   = nz_in_row[i];
245     ipos                 = ipnz[nz];
246     aijperm->iperm[ipos] = i;
247     ipnz[nz]++;
248   }
249 
250   /* Clean up temporary work arrays. */
251   PetscCall(PetscFree(rows_in_bucket));
252   PetscCall(PetscFree(ipnz));
253   PetscCall(PetscFree(nz_in_row));
254   PetscFunctionReturn(0);
255 }
256 
257 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
258 {
259   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
260 
261   PetscFunctionBegin;
262   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
263 
264   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
265    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
266    * I'm not sure if this is the best way to do this, but it avoids
267    * a lot of code duplication.
268    * I also note that currently MATSEQAIJPERM doesn't know anything about
269    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
270    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
271    * this, this may break things.  (Don't know... haven't looked at it.) */
272   a->inode.use = PETSC_FALSE;
273   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
274 
275   /* Now calculate the permutation and grouping information. */
276   PetscCall(MatSeqAIJPERM_create_perm(A));
277   PetscFunctionReturn(0);
278 }
279 
/* Computes yy = A*xx.  Rows are processed group-by-group (all rows in a
 * group have the same nonzero count nz); within a group, chunks of up to
 * NDIM rows are handled either row-at-a-time (nz > chunk size) or
 * column-of-the-chunk at a time (nz <= chunk size), the latter giving the
 * ELLPACK-style vectorizable inner loop. */
PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj,*ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt          i,j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d           vec_x,vec_y,vec_vals;
  __m256i           vec_idx,vec_ipos,vec_j;
  __mmask8           mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
  PetscInt          *iperm;  /* Points to the permutation vector. */
  PetscInt          *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt          *nzgroup;
  PetscInt          ngroup;
  PetscInt          igroup;
  PetscInt          jstart,jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt          iold,nz;
  PetscInt          istart,iend,isize;
  PetscInt          ipos;
  PetscScalar       yp[NDIM];
  PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x,*y,*aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx,&x));
  PetscCall(VecGetArray(yy,&y));
  aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
  aa   = a->a; /* Nonzero elements stored row-by-row. */
  ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  /* Dead branch ('notworking' is never defined); kept for reference. */
  fortranmultaijperm_(&m,x,ii,aj,aa,y);
#else

  for (igroup=0; igroup<ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup+1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i=jstart; i<=jend; i++) {
        y[iperm[i]] = 0.0;
      }
    } else if (nz == 1) {
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {

      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart=jstart; istart<=jend; istart+=NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i=0; i<isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar) 0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i=0; i<isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process the row 8 doubles at a time with gathered loads of x;
             * the remainder of nz mod 8 is handled below: 3..7 leftovers use
             * a masked gather, 1 or 2 use scalar code. */
            vec_y = _mm512_setzero_pd();
            ipos = ip[i];
            for (j=0; j<(nz>>3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
              ipos += 8;
            }
            if ((nz&0x07)>2) {
              mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
            } else if ((nz&0x07)==2) {
              yp[i] += aa[ipos]*x[aj[ipos]];
              yp[i] += aa[ipos+1]*x[aj[ipos+1]];
            } else if ((nz&0x07)==1) {
              yp[i] += aa[ipos]*x[aj[ipos]];
            }
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j=0; j<nz; j++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j=0; j<nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Handle 8 rows of the chunk per iteration; the isize mod 8
             * leftover rows are finished with the scalar loop below. */
            vec_j = _mm256_set1_epi32(j);
            for (i=0; i<((isize>>3)<<3); i+=8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
              _mm512_storeu_pd(&yp[i],vec_y);
            }
            for (i=isize-(isize&0x07); i<isize; i++) {
              ipos = ip[i]+j;
              yp[i] += aa[ipos]*x[aj[ipos]];
            }
#else
            for (i=0; i<isize; i++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i=0; i<isize; i++) {
          y[iperm[istart+i]] = yp[i];
        }
      } /* End processing chunk of isize rows of a group. */
    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */
#endif
  /* 2*nz flops minus one add per row (the first product needs no add). */
  PetscCall(PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0)));
  PetscCall(VecRestoreArrayRead(xx,&x));
  PetscCall(VecRestoreArray(yy,&y));
  PetscFunctionReturn(0);
}
466 
467 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
468  * Note that the names I used to designate the vectors differs from that
469  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
470  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
471 /*
472     I hate having virtually identical code for the mult and the multadd!!!
473 */
/* Computes yy = ww + A*xx using the same group/chunk strategy as
 * MatMult_SeqAIJPERM(); the only differences are that yp[] is seeded from
 * w instead of zero and there is no AVX-512 path here. */
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*w;
  const MatScalar   *aa;
  const PetscInt    *aj,*ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i,j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM * aijperm;
  PetscInt       *iperm;    /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt ngroup;
  PetscInt igroup;
  PetscInt jstart,jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold,nz;
  PetscInt    istart,iend,isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x,*y,*aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx,&x));
  PetscCall(VecGetArrayPair(yy,ww,&y,&w));

  aj = a->j;   /* aj[k] gives column index for element aa[k]. */
  aa = a->a;   /* Nonzero elements stored row-by-row. */
  ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM*) A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
#else

  for (igroup=0; igroup<ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup+1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      /* Empty rows: result is just the corresponding entry of w. */
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    }
    else if (nz == 1) {
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {

      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart=jstart; istart<=jend; istart+=NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i=0; i<isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i=0; i<isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j=0; j<nz; j++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j=0; j<nz; j++) {
            for (i=0; i<isize; i++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i=0; i<isize; i++) {
          y[iperm[istart+i]] = yp[i];
        }
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0*a->nz));
  PetscCall(VecRestoreArrayRead(xx,&x));
  PetscCall(VecRestoreArrayPair(yy,ww,&y,&w));
  PetscFunctionReturn(0);
}
621 
622 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
623  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
624  * routine, but can also be used to convert an assembled SeqAIJ matrix
625  * into a SeqAIJPERM one. */
626 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
627 {
628   Mat            B = *newmat;
629   Mat_SeqAIJPERM *aijperm;
630   PetscBool      sametype;
631 
632   PetscFunctionBegin;
633   if (reuse == MAT_INITIAL_MATRIX) {
634     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
635   }
636   PetscCall(PetscObjectTypeCompare((PetscObject)A,type,&sametype));
637   if (sametype) PetscFunctionReturn(0);
638 
639   PetscCall(PetscNewLog(B,&aijperm));
640   B->spptr = (void*) aijperm;
641 
642   /* Set function pointers for methods that we inherit from AIJ but override. */
643   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
644   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
645   B->ops->destroy     = MatDestroy_SeqAIJPERM;
646   B->ops->mult        = MatMult_SeqAIJPERM;
647   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
648 
649   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
650   /* If A has already been assembled, compute the permutation. */
651   if (A->assembled) {
652     PetscCall(MatSeqAIJPERM_create_perm(B));
653   }
654 
655   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
656 
657   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM));
658   *newmat = B;
659   PetscFunctionReturn(0);
660 }
661 
662 /*@C
663    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
664    This type inherits from AIJ, but calculates some additional permutation
665    information that is used to allow better vectorization of some
666    operations.  At the cost of increased storage, the AIJ formatted
667    matrix can be copied to a format in which pieces of the matrix are
668    stored in ELLPACK format, allowing the vectorized matrix multiply
669    routine to use stride-1 memory accesses.  As with the AIJ type, it is
670    important to preallocate matrix storage in order to get good assembly
671    performance.
672 
673    Collective
674 
675    Input Parameters:
676 +  comm - MPI communicator, set to PETSC_COMM_SELF
677 .  m - number of rows
678 .  n - number of columns
679 .  nz - number of nonzeros per row (same for all rows)
680 -  nnz - array containing the number of nonzeros in the various rows
681          (possibly different for each row) or NULL
682 
683    Output Parameter:
684 .  A - the matrix
685 
686    Notes:
687    If nnz is given then nz is ignored
688 
689    Level: intermediate
690 
691 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
692 @*/
PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  /* Create an m-by-n sequential matrix, set its type to MATSEQAIJPERM
   * (which installs the overridden operations via the convert routine),
   * then preallocate with the SeqAIJ rules: per-row counts nnz[] take
   * precedence over the uniform count nz when nnz is non-NULL. */
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJPERM));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz));
  PetscFunctionReturn(0);
}
702 
/* Type-registration constructor for MATSEQAIJPERM: build a plain SeqAIJ
 * matrix first, then convert it in place, which attaches the permutation
 * data and overrides the relevant operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSetType(A,MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}
710