/*
  Defines basic operations for the MATSEQAIJPERM matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but augments
  it with some permutation information that enables some operations
  to be more vectorizable. A physically rearranged copy of the matrix
  may be stored if the user desires.

  Eventually a variety of permutations may be supported.
*/
11
#include <../src/mat/impls/aij/seq/aij.h>

/* AVX-512 kernels are compiled only for double-precision real scalars with
 * 32-bit indices, when the compiler and headers support the intrinsics. */
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
#include <immintrin.h>

/* Gather-scale constants for the intrinsics; some toolchains predefine them. */
#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif
#if !defined(_MM_SCALE_4)
#define _MM_SCALE_4 4
#endif
#endif

#define NDIM 512
/* NDIM specifies how many rows at a time we should work with when
 * performing the vectorized mat-vec. This depends on various factors
 * such as vector register length, etc., and I really need to add a
 * way for the user (or the library) to tune this. I'm setting it to
 * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 * routines. */

/* Private data attached to a MATSEQAIJPERM matrix via A->spptr.
 * It records a permutation that sorts rows into groups of equal
 * nonzero count, enabling vectorization across rows of a group. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt ngroup; /* number of groups of rows with identical nonzero counts */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins; xgroup[ngroup] marks one past the last group. */

  PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector: iperm[k] is the original row number of the kth permuted row. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet. None of these fields are used by the
   * routines in this file. */
  PetscInt blocksize;
  PetscInt nstep;
  PetscInt *jstart_list;
  PetscInt *jend_list;
  PetscInt *action_list;
  PetscInt *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;
58
/* Converts a MATSEQAIJPERM matrix back to plain MATSEQAIJ by restoring the
 * SeqAIJ function pointers and freeing the permutation data in A->spptr.
 * This routine is only called to convert a MATAIJPERM to its base PETSc type,
 * so we will ignore 'MatType type'. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
{
  Mat             B       = *newmat;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
    /* The duplicate has its own copy of the permutation data; free that one. */
    aijperm = (Mat_SeqAIJPERM *)B->spptr;
  }

  /* Reset the original function pointers. */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy     = MatDestroy_SeqAIJ;
  B->ops->duplicate   = MatDuplicate_SeqAIJ;
  B->ops->mult        = MatMult_SeqAIJ;
  B->ops->multadd     = MatMultAdd_SeqAIJ;

  /* Remove the conversion-back hook; B is no longer a SeqAIJPERM. */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));

  /* Free everything in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));
  PetscCall(PetscFree(B->spptr));

  /* Change the type of B to MATSEQAIJ. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));

  *newmat = B;
  PetscFunctionReturn(PETSC_SUCCESS);
}
93
/* Destroys a MATSEQAIJPERM matrix: frees the permutation data attached to
 * A->spptr, reverts the object's type name to MATSEQAIJ, and delegates the
 * rest of the cleanup to MatDestroy_SeqAIJ(). */
static PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;

  PetscFunctionBegin;
  if (aijperm) {
    /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
    PetscCall(PetscFree(aijperm->xgroup));
    PetscCall(PetscFree(aijperm->nzgroup));
    PetscCall(PetscFree(aijperm->iperm));
    PetscCall(PetscFree(A->spptr));
  }
  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
  /* Note that I don't call MatSetType(). I believe this is because that
   * is only to be called when *building* a matrix. I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
116
/* Duplicates a MATSEQAIJPERM matrix. The SeqAIJ part is duplicated by
 * MatDuplicate_SeqAIJ(); this routine then makes the duplicate a SeqAIJPERM
 * (if it is not already one) and deep-copies the permutation/grouping data. */
static PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool perm;

  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, op, M));
  PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
  if (perm) {
    /* The duplicate already carries permutation arrays; drop them so the
     * struct copy below does not leave dangling shared pointers. */
    aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
    PetscCall(PetscFree(aijperm_dest->xgroup));
    PetscCall(PetscFree(aijperm_dest->nzgroup));
    PetscCall(PetscFree(aijperm_dest->iperm));
  } else {
    /* Promote the duplicate to SeqAIJPERM: attach fresh private data and the
     * conversion-back hook. */
    PetscCall(PetscNew(&aijperm_dest));
    (*M)->spptr = (void *)aijperm_dest;
    PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
    PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
  }
  /* Copy the whole struct (scalars and pointers); the pointer fields are then
   * replaced by freshly allocated deep copies below. */
  PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary. But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory. So the duplicated matrix
   * may actually use slightly less storage than the original! */
  PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
  PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
  PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
  PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
  PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
  PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
  PetscFunctionReturn(PETSC_SUCCESS);
}
152
/* Builds the row permutation and grouping information for a SeqAIJPERM
 * matrix: rows are bucket-sorted by their nonzero counts, producing the
 * iperm, xgroup and nzgroup arrays described in Mat_SeqAIJPERM. Re-runs
 * only when the matrix's nonzero structure (nonzerostate) has changed. */
static PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt m;   /* Number of rows in the matrix. */
  PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
  PetscInt maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt nz;
  PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  /* Sized PetscMax(maxnz, m) + 1, which is at least the maxnz + 1 entries
   * actually indexed below. */
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

  for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info. There will be at most (maxnz + 1)
   * groups. (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.) If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

  /* Second pass. Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm: exclusive prefix sum of the
   * bucket counts gives each bucket's starting slot, then rows are placed
   * in order of their original index (the sort is stable). */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];

  for (i = 0; i < m; i++) {
    nz   = nz_in_row[i];
    ipos = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(PETSC_SUCCESS);
}
252
MatAssemblyEnd_SeqAIJPERM(Mat A,MatAssemblyType mode)253 static PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
254 {
255 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
256
257 PetscFunctionBegin;
258 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
259
260 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
261 * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
262 * I'm not sure if this is the best way to do this, but it avoids
263 * a lot of code duplication.
264 * I also note that currently MATSEQAIJPERM doesn't know anything about
265 * the Mat_CompressedRow data structure that SeqAIJ now uses when there
266 * are many zero rows. If the SeqAIJ assembly end routine decides to use
267 * this, this may break things. (Don't know... haven't looked at it.) */
268 a->inode.use = PETSC_FALSE;
269 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
270
271 /* Now calculate the permutation and grouping information. */
272 PetscCall(MatSeqAIJPERM_create_perm(A));
273 PetscFunctionReturn(PETSC_SUCCESS);
274 }
275
/* Computes yy = A * xx using the row permutation/grouping information.
 * Rows are processed group by group (all rows in a group share the same
 * nonzero count nz). Within a group, chunks of up to NDIM rows are handled
 * either one row at a time (when nz > chunk size) or "column-wise" across
 * the chunk (when the chunk has at least nz rows), the latter giving
 * vectorizable stride patterns. An AVX-512 path is used when available. */
static PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
  PetscInt           i, j;
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Row dot product in chunks of 8: gather x via 32-bit column
             * indices, fused multiply-add into an accumulator register. */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Remainder of the row: masked gather/FMA for > 2 leftover
             * entries, scalar code for 1 or 2 (cheaper than a gather). */
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            /* Horizontal sum of the 8 accumulator lanes. */
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process 8 rows of the chunk at a time: each lane handles the
             * jth nonzero of a different row. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            /* Scalar cleanup for the last (isize mod 8) rows. */
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
450
/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names I used to designate the vectors differs from that
 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one.
 * The structure mirrors MatMult_SeqAIJPERM(): rows are processed in groups
 * of equal nonzero count, in chunks of up to NDIM rows, choosing row-wise or
 * column-wise traversal depending on whether nz exceeds the chunk size. */
/*
    I hate having virtually identical code for the mult and the multadd!!!
*/
static PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
  PetscInt           i, j;

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt *nzgroup;
  PetscInt  ngroup;
  PetscInt  igroup;
  PetscInt  jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt    iold, nz;
  PetscInt    istart, iend, isize;
  PetscInt    ipos;
  PetscScalar yp[NDIM];
  PetscInt    ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      /* Empty rows: the result is just the corresponding entry of w. */
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. Unlike MatMult_SeqAIJPERM(), yp[]
         * starts from w rather than zero, which realizes the "+ ww" part. */
        for (i = 0; i < isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */

  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(PETSC_SUCCESS);
}
594
595 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
596 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
597 * routine, but can also be used to convert an assembled SeqAIJ matrix
598 * into a SeqAIJPERM one. */
MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat * newmat)599 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
600 {
601 Mat B = *newmat;
602 Mat_SeqAIJPERM *aijperm;
603 PetscBool sametype;
604
605 PetscFunctionBegin;
606 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
607 PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
608 if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
609
610 PetscCall(PetscNew(&aijperm));
611 B->spptr = (void *)aijperm;
612
613 /* Set function pointers for methods that we inherit from AIJ but override. */
614 B->ops->duplicate = MatDuplicate_SeqAIJPERM;
615 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
616 B->ops->destroy = MatDestroy_SeqAIJPERM;
617 B->ops->mult = MatMult_SeqAIJPERM;
618 B->ops->multadd = MatMultAdd_SeqAIJPERM;
619
620 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
621 /* If A has already been assembled, compute the permutation. */
622 if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
623
624 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
625
626 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
627 *newmat = B;
628 PetscFunctionReturn(PETSC_SUCCESS);
629 }
630
/*@C
  MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is given
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This type inherits from `MATSEQAIJ`, but calculates some additional permutation information
  that is used to allow better vectorization of some operations. At the cost of increased
  storage, the `MATSEQAIJ` formatted matrix can be copied to a format in which pieces of the
  matrix are stored in ELLPACK format, allowing the vectorized matrix multiply routine to use
  stride-1 memory accesses.

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* Standard PETSc creation sequence: create, size, type, preallocate. */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJPERM));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
666
/* Type-registration constructor for MATSEQAIJPERM: builds A as a plain
 * MATSEQAIJ, then converts it in place to the SeqAIJPERM type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSetType(A, MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
674