xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 37eeb8152ec6a2cf24186d3591c2c5de5dfd8fa5)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   PetscErrorCode ierr;
65   Mat            B       = *newmat;
66   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67 
68   PetscFunctionBegin;
69   if (reuse == MAT_INITIAL_MATRIX) {
70     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72   }
73 
74   /* Reset the original function pointers. */
75   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76   B->ops->destroy     = MatDestroy_SeqAIJ;
77   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78   B->ops->mult        = MatMult_SeqAIJ;
79   B->ops->multadd     = MatMultAdd_SeqAIJ;
80 
81   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
83   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
84   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
85   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",NULL);CHKERRQ(ierr);
86 
87   /* Free everything in the Mat_SeqAIJPERM data structure.*/
88   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
89   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
90   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
91   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
92 
93   /* Change the type of B to MATSEQAIJ. */
94   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
95 
96   *newmat = B;
97   PetscFunctionReturn(0);
98 }
99 
100 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
101 {
102   PetscErrorCode ierr;
103   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
104 
105   PetscFunctionBegin;
106   if (aijperm) {
107     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
108     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
109     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
110     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
111     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
112   }
113   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
114    * to destroy everything that remains. */
115   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
116   /* Note that I don't call MatSetType().  I believe this is because that
117    * is only to be called when *building* a matrix.  I could be wrong, but
118    * that is how things work for the SuperLU matrix class. */
119   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
120   PetscFunctionReturn(0);
121 }
122 
123 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
124 {
125   PetscErrorCode ierr;
126   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
127   Mat_SeqAIJPERM *aijperm_dest;
128   PetscBool      perm;
129 
130   PetscFunctionBegin;
131   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
132   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
133   if (perm) {
134     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
135     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
136     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
137     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
138   } else {
139     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
140     (*M)->spptr = (void*) aijperm_dest;
141     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
142     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
143     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
144     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
145     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
146   }
147   ierr        = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
148   /* Allocate space for, and copy the grouping and permutation info.
149    * I note that when the groups are initially determined in
150    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
151    * necessary.  But at this point, we know how large they need to be, and
152    * allocate only the necessary amount of memory.  So the duplicated matrix
153    * may actually use slightly less storage than the original! */
154   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
155   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
156   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
157   ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
158   ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
159   ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
164 {
165   PetscErrorCode ierr;
166   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
167   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
168   PetscInt       m;       /* Number of rows in the matrix. */
169   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
170   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
171   PetscInt       *rows_in_bucket;
172   /* To construct the permutation, we sort each row into one of maxnz
173    * buckets based on how many nonzeros are in the row. */
174   PetscInt       nz;
175   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
176   PetscInt       *ipnz;
177   /* When constructing the iperm permutation vector,
178    * ipnz[nz] is used to point to the next place in the permutation vector
179    * that a row with nz nonzero elements should be placed.*/
180   PetscInt       i, ngroup, istart, ipos;
181 
182   PetscFunctionBegin;
183   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
184   aijperm->nonzerostate = A->nonzerostate;
185  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
186   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
187   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
188   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
189 
190   m  = A->rmap->n;
191   ia = a->i;
192 
193   /* Allocate the arrays that will hold the permutation vector. */
194   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
195 
196   /* Allocate some temporary work arrays that will be used in
197    * calculating the permuation vector and groupings. */
198   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
199 
200   /* Now actually figure out the permutation and grouping. */
201 
202   /* First pass: Determine number of nonzeros in each row, maximum
203    * number of nonzeros in any row, and how many rows fall into each
204    * "bucket" of rows with same number of nonzeros. */
205   maxnz = 0;
206   for (i=0; i<m; i++) {
207     nz_in_row[i] = ia[i+1]-ia[i];
208     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
209   }
210   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
211   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
212 
213   for (i=0; i<=maxnz; i++) {
214     rows_in_bucket[i] = 0;
215   }
216   for (i=0; i<m; i++) {
217     nz = nz_in_row[i];
218     rows_in_bucket[nz]++;
219   }
220 
221   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
222    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
223    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
224    * then xgroup[] must consist of (maxnz + 2) elements, since the last
225    * element of xgroup will tell us where the (maxnz + 1)th group ends.
226    * We allocate space for the maximum number of groups;
227    * that is potentially a little wasteful, but not too much so.
228    * Perhaps I should fix it later. */
229   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
230   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
231 
232   /* Second pass.  Look at what is in the buckets and create the groupings.
233    * Note that it is OK to have a group of rows with no non-zero values. */
234   ngroup = 0;
235   istart = 0;
236   for (i=0; i<=maxnz; i++) {
237     if (rows_in_bucket[i] > 0) {
238       aijperm->nzgroup[ngroup] = i;
239       aijperm->xgroup[ngroup]  = istart;
240       ngroup++;
241       istart += rows_in_bucket[i];
242     }
243   }
244 
245   aijperm->xgroup[ngroup] = istart;
246   aijperm->ngroup         = ngroup;
247 
248   /* Now fill in the permutation vector iperm. */
249   ipnz[0] = 0;
250   for (i=0; i<maxnz; i++) {
251     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
252   }
253 
254   for (i=0; i<m; i++) {
255     nz                   = nz_in_row[i];
256     ipos                 = ipnz[nz];
257     aijperm->iperm[ipos] = i;
258     ipnz[nz]++;
259   }
260 
261   /* Clean up temporary work arrays. */
262   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
263   ierr = PetscFree(ipnz);CHKERRQ(ierr);
264   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
265   PetscFunctionReturn(0);
266 }
267 
268 
269 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
270 {
271   PetscErrorCode ierr;
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273 
274   PetscFunctionBegin;
275   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
276 
277   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
278    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
279    * I'm not sure if this is the best way to do this, but it avoids
280    * a lot of code duplication.
281    * I also note that currently MATSEQAIJPERM doesn't know anything about
282    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
283    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
284    * this, this may break things.  (Don't know... haven't looked at it.) */
285   a->inode.use = PETSC_FALSE;
286   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
287 
288   /* Now calculate the permutation and grouping information. */
289   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
290   PetscFunctionReturn(0);
291 }
292 
293 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
294 {
295   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
296   const PetscScalar *x;
297   PetscScalar       *y;
298   const MatScalar   *aa;
299   PetscErrorCode    ierr;
300   const PetscInt    *aj,*ai;
301 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
302   PetscInt          i,j;
303 #endif
304 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305   __m512d           vec_x,vec_y,vec_vals;
306   __m256i           vec_idx,vec_ipos,vec_j;
307   __mmask8           mask;
308 #endif
309 
310   /* Variables that don't appear in MatMult_SeqAIJ. */
311   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
312   PetscInt          *iperm;  /* Points to the permutation vector. */
313   PetscInt          *xgroup;
314   /* Denotes where groups of rows with same number of nonzeros
315    * begin and end in iperm. */
316   PetscInt          *nzgroup;
317   PetscInt          ngroup;
318   PetscInt          igroup;
319   PetscInt          jstart,jend;
320   /* jstart is used in loops to denote the position in iperm where a
321    * group starts; jend denotes the position where it ends.
322    * (jend + 1 is where the next group starts.) */
323   PetscInt          iold,nz;
324   PetscInt          istart,iend,isize;
325   PetscInt          ipos;
326   PetscScalar       yp[NDIM];
327   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
328 
329 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330 #pragma disjoint(*x,*y,*aa)
331 #endif
332 
333   PetscFunctionBegin;
334   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
336   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
337   aa   = a->a; /* Nonzero elements stored row-by-row. */
338   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
339 
340   /* Get the info we need about the permutations and groupings. */
341   iperm   = aijperm->iperm;
342   ngroup  = aijperm->ngroup;
343   xgroup  = aijperm->xgroup;
344   nzgroup = aijperm->nzgroup;
345 
346 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
347   fortranmultaijperm_(&m,x,ii,aj,aa,y);
348 #else
349 
350   for (igroup=0; igroup<ngroup; igroup++) {
351     jstart = xgroup[igroup];
352     jend   = xgroup[igroup+1] - 1;
353     nz     = nzgroup[igroup];
354 
355     /* Handle the special cases where the number of nonzeros per row
356      * in the group is either 0 or 1. */
357     if (nz == 0) {
358       for (i=jstart; i<=jend; i++) {
359         y[iperm[i]] = 0.0;
360       }
361     } else if (nz == 1) {
362       for (i=jstart; i<=jend; i++) {
363         iold    = iperm[i];
364         ipos    = ai[iold];
365         y[iold] = aa[ipos] * x[aj[ipos]];
366       }
367     } else {
368 
369       /* We work our way through the current group in chunks of NDIM rows
370        * at a time. */
371 
372       for (istart=jstart; istart<=jend; istart+=NDIM) {
373         /* Figure out where the chunk of 'isize' rows ends in iperm.
374          * 'isize may of course be less than NDIM for the last chunk. */
375         iend = istart + (NDIM - 1);
376 
377         if (iend > jend) iend = jend;
378 
379         isize = iend - istart + 1;
380 
381         /* Initialize the yp[] array that will be used to hold part of
382          * the permuted results vector, and figure out where in aa each
383          * row of the chunk will begin. */
384         for (i=0; i<isize; i++) {
385           iold = iperm[istart + i];
386           /* iold is a row number from the matrix A *before* reordering. */
387           ip[i] = ai[iold];
388           /* ip[i] tells us where the ith row of the chunk begins in aa. */
389           yp[i] = (PetscScalar) 0.0;
390         }
391 
392         /* If the number of zeros per row exceeds the number of rows in
393          * the chunk, we should vectorize along nz, that is, perform the
394          * mat-vec one row at a time as in the usual CSR case. */
395         if (nz > isize) {
396 #if defined(PETSC_HAVE_CRAY_VECTOR)
397 #pragma _CRI preferstream
398 #endif
399           for (i=0; i<isize; i++) {
400 #if defined(PETSC_HAVE_CRAY_VECTOR)
401 #pragma _CRI prefervector
402 #endif
403 
404 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
405             vec_y = _mm512_setzero_pd();
406             ipos = ip[i];
407             for (j=0; j<(nz>>3); j++) {
408               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409               vec_vals = _mm512_loadu_pd(&aa[ipos]);
410               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
411               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
412               ipos += 8;
413             }
414             if ((nz&0x07)>2) {
415               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
416               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
417               vec_vals = _mm512_loadu_pd(&aa[ipos]);
418               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
419               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
420             } else if ((nz&0x07)==2) {
421               yp[i] += aa[ipos]*x[aj[ipos]];
422               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
423             } else if ((nz&0x07)==1) {
424               yp[i] += aa[ipos]*x[aj[ipos]];
425             }
426             yp[i] += _mm512_reduce_add_pd(vec_y);
427 #else
428             for (j=0; j<nz; j++) {
429               ipos   = ip[i] + j;
430               yp[i] += aa[ipos] * x[aj[ipos]];
431             }
432 #endif
433           }
434         } else {
435           /* Otherwise, there are enough rows in the chunk to make it
436            * worthwhile to vectorize across the rows, that is, to do the
437            * matvec by operating with "columns" of the chunk. */
438           for (j=0; j<nz; j++) {
439 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
440             vec_j = _mm256_set1_epi32(j);
441             for (i=0; i<((isize>>3)<<3); i+=8) {
442               vec_y    = _mm512_loadu_pd(&yp[i]);
443               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
444               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
445               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
446               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
447               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
448               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
449               _mm512_storeu_pd(&yp[i],vec_y);
450             }
451             for (i=isize-(isize&0x07); i<isize; i++) {
452               ipos = ip[i]+j;
453               yp[i] += aa[ipos]*x[aj[ipos]];
454             }
455 #else
456             for (i=0; i<isize; i++) {
457               ipos   = ip[i] + j;
458               yp[i] += aa[ipos] * x[aj[ipos]];
459             }
460 #endif
461           }
462         }
463 
464 #if defined(PETSC_HAVE_CRAY_VECTOR)
465 #pragma _CRI ivdep
466 #endif
467         /* Put results from yp[] into non-permuted result vector y. */
468         for (i=0; i<isize; i++) {
469           y[iperm[istart+i]] = yp[i];
470         }
471       } /* End processing chunk of isize rows of a group. */
472     } /* End handling matvec for chunk with nz > 1. */
473   } /* End loop over igroup. */
474 #endif
475   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
476   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
477   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
478   PetscFunctionReturn(0);
479 }
480 
481 
482 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
483  * Note that the names I used to designate the vectors differs from that
484  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
485  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
486 /*
487     I hate having virtually identical code for the mult and the multadd!!!
488 */
489 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
490 {
491   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
492   const PetscScalar *x;
493   PetscScalar       *y,*w;
494   const MatScalar   *aa;
495   PetscErrorCode    ierr;
496   const PetscInt    *aj,*ai;
497 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
498   PetscInt i,j;
499 #endif
500 
501   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
502   Mat_SeqAIJPERM * aijperm;
503   PetscInt       *iperm;    /* Points to the permutation vector. */
504   PetscInt       *xgroup;
505   /* Denotes where groups of rows with same number of nonzeros
506    * begin and end in iperm. */
507   PetscInt *nzgroup;
508   PetscInt ngroup;
509   PetscInt igroup;
510   PetscInt jstart,jend;
511   /* jstart is used in loops to denote the position in iperm where a
512    * group starts; jend denotes the position where it ends.
513    * (jend + 1 is where the next group starts.) */
514   PetscInt    iold,nz;
515   PetscInt    istart,iend,isize;
516   PetscInt    ipos;
517   PetscScalar yp[NDIM];
518   PetscInt    ip[NDIM];
519   /* yp[] and ip[] are treated as vector "registers" for performing
520    * the mat-vec. */
521 
522 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
523 #pragma disjoint(*x,*y,*aa)
524 #endif
525 
526   PetscFunctionBegin;
527   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
528   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
529 
530   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
531   aa = a->a;   /* Nonzero elements stored row-by-row. */
532   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
533 
534   /* Get the info we need about the permutations and groupings. */
535   aijperm = (Mat_SeqAIJPERM*) A->spptr;
536   iperm   = aijperm->iperm;
537   ngroup  = aijperm->ngroup;
538   xgroup  = aijperm->xgroup;
539   nzgroup = aijperm->nzgroup;
540 
541 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
542   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
543 #else
544 
545   for (igroup=0; igroup<ngroup; igroup++) {
546     jstart = xgroup[igroup];
547     jend   = xgroup[igroup+1] - 1;
548 
549     nz = nzgroup[igroup];
550 
551     /* Handle the special cases where the number of nonzeros per row
552      * in the group is either 0 or 1. */
553     if (nz == 0) {
554       for (i=jstart; i<=jend; i++) {
555         iold    = iperm[i];
556         y[iold] = w[iold];
557       }
558     }
559     else if (nz == 1) {
560       for (i=jstart; i<=jend; i++) {
561         iold    = iperm[i];
562         ipos    = ai[iold];
563         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
564       }
565     }
566     /* For the general case: */
567     else {
568 
569       /* We work our way through the current group in chunks of NDIM rows
570        * at a time. */
571 
572       for (istart=jstart; istart<=jend; istart+=NDIM) {
573         /* Figure out where the chunk of 'isize' rows ends in iperm.
574          * 'isize may of course be less than NDIM for the last chunk. */
575         iend = istart + (NDIM - 1);
576         if (iend > jend) iend = jend;
577         isize = iend - istart + 1;
578 
579         /* Initialize the yp[] array that will be used to hold part of
580          * the permuted results vector, and figure out where in aa each
581          * row of the chunk will begin. */
582         for (i=0; i<isize; i++) {
583           iold = iperm[istart + i];
584           /* iold is a row number from the matrix A *before* reordering. */
585           ip[i] = ai[iold];
586           /* ip[i] tells us where the ith row of the chunk begins in aa. */
587           yp[i] = w[iold];
588         }
589 
590         /* If the number of zeros per row exceeds the number of rows in
591          * the chunk, we should vectorize along nz, that is, perform the
592          * mat-vec one row at a time as in the usual CSR case. */
593         if (nz > isize) {
594 #if defined(PETSC_HAVE_CRAY_VECTOR)
595 #pragma _CRI preferstream
596 #endif
597           for (i=0; i<isize; i++) {
598 #if defined(PETSC_HAVE_CRAY_VECTOR)
599 #pragma _CRI prefervector
600 #endif
601             for (j=0; j<nz; j++) {
602               ipos   = ip[i] + j;
603               yp[i] += aa[ipos] * x[aj[ipos]];
604             }
605           }
606         }
607         /* Otherwise, there are enough rows in the chunk to make it
608          * worthwhile to vectorize across the rows, that is, to do the
609          * matvec by operating with "columns" of the chunk. */
610         else {
611           for (j=0; j<nz; j++) {
612             for (i=0; i<isize; i++) {
613               ipos   = ip[i] + j;
614               yp[i] += aa[ipos] * x[aj[ipos]];
615             }
616           }
617         }
618 
619 #if defined(PETSC_HAVE_CRAY_VECTOR)
620 #pragma _CRI ivdep
621 #endif
622         /* Put results from yp[] into non-permuted result vector y. */
623         for (i=0; i<isize; i++) {
624           y[iperm[istart+i]] = yp[i];
625         }
626       } /* End processing chunk of isize rows of a group. */
627 
628     } /* End handling matvec for chunk with nz > 1. */
629   } /* End loop over igroup. */
630 
631 #endif
632   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
633   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
634   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
635   PetscFunctionReturn(0);
636 }
637 
638 /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJPERM(), below. */
639 PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*);
640 
641 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
642  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
643  * routine, but can also be used to convert an assembled SeqAIJ matrix
644  * into a SeqAIJPERM one. */
645 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
646 {
647   PetscErrorCode ierr;
648   Mat            B = *newmat;
649   Mat_SeqAIJPERM *aijperm;
650   PetscBool      sametype;
651 
652   PetscFunctionBegin;
653   if (reuse == MAT_INITIAL_MATRIX) {
654     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
655   }
656   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
657   if (sametype) PetscFunctionReturn(0);
658 
659   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
660   B->spptr = (void*) aijperm;
661 
662   /* Set function pointers for methods that we inherit from AIJ but override. */
663   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
664   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
665   B->ops->destroy     = MatDestroy_SeqAIJPERM;
666   B->ops->mult        = MatMult_SeqAIJPERM;
667   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
668 
669   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
670   /* If A has already been assembled, compute the permutation. */
671   if (A->assembled) {
672     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
673   }
674 
675   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
676   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
677   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
678   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
679   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",MatPtAP_IS_XAIJ);CHKERRQ(ierr);
680 
681   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
682   *newmat = B;
683   PetscFunctionReturn(0);
684 }
685 
686 /*@C
687    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
688    This type inherits from AIJ, but calculates some additional permutation
689    information that is used to allow better vectorization of some
690    operations.  At the cost of increased storage, the AIJ formatted
691    matrix can be copied to a format in which pieces of the matrix are
692    stored in ELLPACK format, allowing the vectorized matrix multiply
693    routine to use stride-1 memory accesses.  As with the AIJ type, it is
694    important to preallocate matrix storage in order to get good assembly
695    performance.
696 
697    Collective
698 
699    Input Parameters:
700 +  comm - MPI communicator, set to PETSC_COMM_SELF
701 .  m - number of rows
702 .  n - number of columns
703 .  nz - number of nonzeros per row (same for all rows)
704 -  nnz - array containing the number of nonzeros in the various rows
705          (possibly different for each row) or NULL
706 
707    Output Parameter:
708 .  A - the matrix
709 
710    Notes:
711    If nnz is given then nz is ignored
712 
713    Level: intermediate
714 
715 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
716 @*/
717 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
718 {
719   PetscErrorCode ierr;
720 
721   PetscFunctionBegin;
722   ierr = MatCreate(comm,A);CHKERRQ(ierr);
723   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
724   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
725   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
726   PetscFunctionReturn(0);
727 }
728 
729 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
730 {
731   PetscErrorCode ierr;
732 
733   PetscFunctionBegin;
734   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
735   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
736   PetscFunctionReturn(0);
737 }
738 
739