xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision d0e6bf2ad94dcc89b258ce16c7987200a4714786)
1 /*
2   Defines basic operations for the MATSEQAIJPERM matrix class.
3   This class is derived from the MATSEQAIJ class and retains the
4   compressed row storage (aka Yale sparse matrix format) but augments
5   it with some permutation information that enables some operations
6   to be more vectorizable.  A physically rearranged copy of the matrix
7   may be stored if the user desires.
8 
9   Eventually a variety of permutations may be supported.
10 */
11 
12 #include <../src/mat/impls/aij/seq/aij.h>
13 
14 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
15   #include <immintrin.h>
16 
17   #if !defined(_MM_SCALE_8)
18     #define _MM_SCALE_8 8
19   #endif
20   #if !defined(_MM_SCALE_4)
21     #define _MM_SCALE_4 4
22   #endif
23 #endif
24 
25 #define NDIM 512
26 /* NDIM specifies how many rows at a time we should work with when
27  * performing the vectorized mat-vec.  This depends on various factors
28  * such as vector register length, etc., and I really need to add a
29  * way for the user (or the library) to tune this.  I'm setting it to
30  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
31  * routines. */
32 
33 typedef struct {
34   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
35 
36   PetscInt  ngroup;
37   PetscInt *xgroup;
38   /* Denotes where groups of rows with same number of nonzeros
39    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
40    * where the ith group begins. */
41 
42   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
43   PetscInt *iperm;   /* The permutation vector. */
44 
45   /* Some of this stuff is for Ed's recursive triangular solve.
46    * I'm not sure what I need yet. */
47   PetscInt   blocksize;
48   PetscInt   nstep;
49   PetscInt  *jstart_list;
50   PetscInt  *jend_list;
51   PetscInt  *action_list;
52   PetscInt  *ngroup_list;
53   PetscInt **ipointer_list;
54   PetscInt **xgroup_list;
55   PetscInt **nzgroup_list;
56   PetscInt **iperm_list;
57 } Mat_SeqAIJPERM;
58 
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
{
  /* Convert a MATSEQAIJPERM matrix back to its base MATSEQAIJ type:
   * restore the SeqAIJ method table and discard the permutation data.
   * This routine is only called to convert a MATAIJPERM to its base PETSc type, */
  /* so we will ignore 'MatType type'. */
  Mat             B       = *newmat;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Work on a fresh copy so A itself is left untouched; the copy carries
     * its own Mat_SeqAIJPERM in spptr, so re-fetch the pointer from B. */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
    aijperm = (Mat_SeqAIJPERM *)B->spptr;
  }

  /* Reset the original (SeqAIJ) function pointers that SeqAIJPERM overrode. */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy     = MatDestroy_SeqAIJ;
  B->ops->duplicate   = MatDuplicate_SeqAIJ;
  B->ops->mult        = MatMult_SeqAIJ;
  B->ops->multadd     = MatMultAdd_SeqAIJ;

  /* Remove the composed conversion function; B is no longer a SeqAIJPERM. */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));

  /* Free everything in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));
  PetscCall(PetscFree(B->spptr));

  /* Change the type of B to MATSEQAIJ. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));

  *newmat = B;
  PetscFunctionReturn(PETSC_SUCCESS);
}
93 
static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
{
  /* Release the permutation data attached to A, then destroy the
   * underlying SeqAIJ matrix via the base class's destroy routine. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (aijperm) {
    /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
    PetscCall(PetscFree(aijperm->xgroup));
    PetscCall(PetscFree(aijperm->nzgroup));
    PetscCall(PetscFree(aijperm->iperm));
    PetscCall(PetscFree(A->spptr));
  }
  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
  /* Note that I don't call MatSetType().  I believe this is because that
   * is only to be called when *building* a matrix.  I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
116 
static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
{
  /* Duplicate A (via the SeqAIJ duplicate) and give the copy its own
   * deep copy of the permutation/grouping arrays. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool       perm;

  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, op, M));
  PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
  if (perm) {
    /* The duplicate already carries a Mat_SeqAIJPERM; drop its array
     * contents so they can be replaced with copies of A's below. */
    aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
    PetscCall(PetscFree(aijperm_dest->xgroup));
    PetscCall(PetscFree(aijperm_dest->nzgroup));
    PetscCall(PetscFree(aijperm_dest->iperm));
  } else {
    /* The duplicate came back as plain SeqAIJ: attach a new
     * Mat_SeqAIJPERM and promote it to the SEQAIJPERM type. */
    PetscCall(PetscNew(&aijperm_dest));
    (*M)->spptr = (void *)aijperm_dest;
    PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
    PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
  }
  /* Shallow-copy the whole struct (scalars such as nonzerostate and ngroup);
   * the three array pointers copied here are immediately replaced below. */
  PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary.  But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory.  So the duplicated matrix
   * may actually use slightly less storage than the original! */
  PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
  PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
  PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
  PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
  PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
  PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
  PetscFunctionReturn(PETSC_SUCCESS);
}
152 
static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
{
  /* Build the row permutation and grouping information for A: rows are
   * bucket-sorted by their nonzero count, so that rows with equal nnz are
   * contiguous in iperm[] and can be processed together in the vectorized
   * mat-vec kernels.  No-op if the nonzero structure has not changed. */
  Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt        m;     /* Number of rows in the matrix. */
  PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
  PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt       *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt  nz;
  PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  /* Sized PetscMax(maxnz, m) + 1 since indices 0..maxnz are used below. */
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

  for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
   * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

  /* Second pass.  Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  /* Sentinel entry: xgroup[ngroup] marks where the last group ends. */
  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm: ipnz[] is the exclusive
   * prefix sum of the bucket counts, i.e. the next free slot per bucket. */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

  for (i = 0; i < m; i++) {
    nz                   = nz_in_row[i];
    ipos                 = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(PETSC_SUCCESS);
}
252 
MatAssemblyEnd_SeqAIJPERM(Mat A,MatAssemblyType mode)253 static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254 {
255   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
256 
257   PetscFunctionBegin;
258   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
259 
260   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
261    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
262    * I'm not sure if this is the best way to do this, but it avoids
263    * a lot of code duplication.
264    * I also note that currently MATSEQAIJPERM doesn't know anything about
265    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
266    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
267    * this, this may break things.  (Don't know... haven't looked at it.) */
268   a->inode.use = PETSC_FALSE;
269   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
270 
271   /* Now calculate the permutation and grouping information. */
272   PetscCall(MatSeqAIJPERM_create_perm(A));
273   PetscFunctionReturn(PETSC_SUCCESS);
274 }
275 
static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
{
  /* Compute yy = A * xx using the row permutation: rows are processed in
   * groups of equal nonzero count, in chunks of up to NDIM rows, so the
   * inner loops vectorize either along a row (CSR style) or across rows
   * ("columns" of the chunk), whichever direction is longer. */
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
  PetscInt           i, j;
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
  #pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* AVX-512 path: 8 doubles per iteration; 32-bit column indices
             * are gathered with a 256-bit load (requires 32-bit PetscInt,
             * hence the !PETSC_USE_64BIT_INDICES guard above). */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Remainder of 3..7 elements: masked gather+FMA; remainders of
             * 1 or 2 are cheaper done with scalar multiplies. */
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            /* Horizontal sum of the 8 partial products. */
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process 8 rows of the chunk at a time; ip[i] + j addresses the
             * jth nonzero of each row, gathered indirectly through aj/aa. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            /* Scalar cleanup for the trailing isize % 8 rows. */
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
450 
451 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
452  * Note that the names I used to designate the vectors differs from that
453  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
454  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
455 /*
456     I hate having virtually identical code for the mult and the multadd!!!
457 */
static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
{
  /* Compute yy = ww + A * xx, following the same group/chunk structure as
   * MatMult_SeqAIJPERM(); the only difference is that yp[] is seeded with
   * w[] instead of zero.  (No AVX-512 path here, unlike the mult kernel.) */
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
  PetscInt           i, j;

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
  #pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  /* Handles the case yy == ww correctly (then y == w). */
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin.  Unlike MatMult, yp starts at w. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
  #pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */

  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(PETSC_SUCCESS);
}
594 
595 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
596  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
597  * routine, but can also be used to convert an assembled SeqAIJ matrix
598  * into a SeqAIJPERM one. */
MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat * newmat)599 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
600 {
601   Mat             B = *newmat;
602   Mat_SeqAIJPERM *aijperm;
603   PetscBool       sametype;
604 
605   PetscFunctionBegin;
606   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
607   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
608   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
609 
610   PetscCall(PetscNew(&aijperm));
611   B->spptr = (void *)aijperm;
612 
613   /* Set function pointers for methods that we inherit from AIJ but override. */
614   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
615   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
616   B->ops->destroy     = MatDestroy_SeqAIJPERM;
617   B->ops->mult        = MatMult_SeqAIJPERM;
618   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
619 
620   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
621   /* If A has already been assembled, compute the permutation. */
622   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
623 
624   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
625 
626   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
627   *newmat = B;
628   PetscFunctionReturn(PETSC_SUCCESS);
629 }
630 
631 /*@C
632   MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
633 
634   Collective
635 
636   Input Parameters:
637 + comm - MPI communicator, set to `PETSC_COMM_SELF`
638 . m    - number of rows
639 . n    - number of columns
640 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is given
641 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
642 
643   Output Parameter:
644 . A - the matrix
645 
646   Level: intermediate
647 
648   Notes:
649   This type inherits from `MATSEQAIJ`, but calculates some additional permutation information
650   that is used to allow better vectorization of some operations.  At the cost of increased
651   storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the
652   matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use
653   stride-1 memory accesses.
654 
655 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
656 @*/
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  /* Create an m-by-n MATSEQAIJPERM matrix with the given preallocation;
   * see the manual page above for the parameter descriptions. */
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* Sequential matrix: local and global sizes coincide. */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJPERM));
  /* Preallocate via the SeqAIJ implementation directly; the type is
   * already MATSEQAIJPERM, which inherits the SeqAIJ storage. */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
666 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  /* Type constructor registered for MATSEQAIJPERM: build A as a plain
   * SeqAIJ first, then convert it in place to SeqAIJPERM. */
  PetscFunctionBegin;
  PetscCall(MatSetType(A, MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
674