xref: /petsc/src/mat/impls/sell/seq/sell.c (revision fdf6c4e30aafdbc795e4f76379caa977fd5cdf5a)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
179   PetscFunctionReturn(0);
180 }
181 
182 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
183 {
184   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
185   PetscInt    shift;
186 
187   PetscFunctionBegin;
188   PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
189   if (nz) *nz = a->rlen[row];
190   shift = a->sliidx[row>>3]+(row&0x07);
191   if (!a->getrowcols) {
192 
193     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
194   }
195   if (idx) {
196     PetscInt j;
197     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
198     *idx = a->getrowcols;
199   }
200   if (v) {
201     PetscInt j;
202     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
203     *v = a->getrowvals;
204   }
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
209 {
210   PetscFunctionBegin;
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
215 {
216   Mat            B;
217   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
218   PetscInt       i;
219 
220   PetscFunctionBegin;
221   if (reuse == MAT_REUSE_MATRIX) {
222     B    = *newmat;
223     PetscCall(MatZeroEntries(B));
224   } else {
225     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
226     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
227     PetscCall(MatSetType(B,MATSEQAIJ));
228     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
229   }
230 
231   for (i=0; i<A->rmap->n; i++) {
232     PetscInt    nz = 0,*cols = NULL;
233     PetscScalar *vals = NULL;
234 
235     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
236     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
237     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
238   }
239 
240   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
241   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
242   B->rmap->bs = A->rmap->bs;
243 
244   if (reuse == MAT_INPLACE_MATRIX) {
245     PetscCall(MatHeaderReplace(A,&B));
246   } else {
247     *newmat = B;
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #include <../src/mat/impls/aij/seq/aij.h>
253 
254 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
255 {
256   Mat               B;
257   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
258   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
259   const PetscInt    *cols;
260   const PetscScalar *vals;
261 
262   PetscFunctionBegin;
263 
264   if (reuse == MAT_REUSE_MATRIX) {
265     B = *newmat;
266   } else {
267     if (PetscDefined(USE_DEBUG) || !a->ilen) {
268       PetscCall(PetscMalloc1(m,&rowlengths));
269       for (i=0; i<m; i++) {
270         rowlengths[i] = ai[i+1] - ai[i];
271       }
272     }
273     if (PetscDefined(USE_DEBUG) && a->ilen) {
274       PetscBool eq;
275       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
276       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
277       PetscCall(PetscFree(rowlengths));
278       rowlengths = a->ilen;
279     } else if (a->ilen) rowlengths = a->ilen;
280     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
281     PetscCall(MatSetSizes(B,m,n,m,n));
282     PetscCall(MatSetType(B,MATSEQSELL));
283     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
284     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
285   }
286 
287   for (row=0; row<m; row++) {
288     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
289     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
290     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291   }
292   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
293   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
294   B->rmap->bs = A->rmap->bs;
295 
296   if (reuse == MAT_INPLACE_MATRIX) {
297     PetscCall(MatHeaderReplace(A,&B));
298   } else {
299     *newmat = B;
300   }
301   PetscFunctionReturn(0);
302 }
303 
304 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
305 {
306   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
307   PetscScalar       *y;
308   const PetscScalar *x;
309   const MatScalar   *aval=a->val;
310   PetscInt          totalslices=a->totalslices;
311   const PetscInt    *acolidx=a->colidx;
312   PetscInt          i,j;
313 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
314   __m512d           vec_x,vec_y,vec_vals;
315   __m256i           vec_idx;
316   __mmask8          mask;
317   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
318   __m256i           vec_idx2,vec_idx3,vec_idx4;
319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
320   __m128i           vec_idx;
321   __m256d           vec_x,vec_y,vec_y2,vec_vals;
322   MatScalar         yval;
323   PetscInt          r,rows_left,row,nnz_in_row;
324 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
325   __m128d           vec_x_tmp;
326   __m256d           vec_x,vec_y,vec_y2,vec_vals;
327   MatScalar         yval;
328   PetscInt          r,rows_left,row,nnz_in_row;
329 #else
330   PetscScalar       sum[8];
331 #endif
332 
333 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
334 #pragma disjoint(*x,*y,*aval)
335 #endif
336 
337   PetscFunctionBegin;
338   PetscCall(VecGetArrayRead(xx,&x));
339   PetscCall(VecGetArray(yy,&y));
340 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   for (i=0; i<totalslices; i++) { /* loop over slices */
342     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
343     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
344 
345     vec_y  = _mm512_setzero_pd();
346     vec_y2 = _mm512_setzero_pd();
347     vec_y3 = _mm512_setzero_pd();
348     vec_y4 = _mm512_setzero_pd();
349 
350     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
351     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
352     case 3:
353       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
354       acolidx += 8; aval += 8;
355       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
358       acolidx += 8; aval += 8;
359       j += 3;
360       break;
361     case 2:
362       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
363       acolidx += 8; aval += 8;
364       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
365       acolidx += 8; aval += 8;
366       j += 2;
367       break;
368     case 1:
369       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
370       acolidx += 8; aval += 8;
371       j += 1;
372       break;
373     }
374     #pragma novector
375     for (; j<(a->sliidx[i+1]>>3); j+=4) {
376       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
377       acolidx += 8; aval += 8;
378       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
383       acolidx += 8; aval += 8;
384     }
385 
386     vec_y = _mm512_add_pd(vec_y,vec_y2);
387     vec_y = _mm512_add_pd(vec_y,vec_y3);
388     vec_y = _mm512_add_pd(vec_y,vec_y4);
389     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
390       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
391       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
392     } else {
393       _mm512_storeu_pd(&y[8*i],vec_y);
394     }
395   }
396 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
397   for (i=0; i<totalslices; i++) { /* loop over full slices */
398     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
399     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
400 
401     /* last slice may have padding rows. Don't use vectorization. */
402     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
403       rows_left = A->rmap->n - 8*i;
404       for (r=0; r<rows_left; ++r) {
405         yval = (MatScalar)0;
406         row = 8*i + r;
407         nnz_in_row = a->rlen[row];
408         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
409         y[row] = yval;
410       }
411       break;
412     }
413 
414     vec_y  = _mm256_setzero_pd();
415     vec_y2 = _mm256_setzero_pd();
416 
417     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
418     #pragma novector
419     #pragma unroll(2)
420     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
421       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
422       aval += 4; acolidx += 4;
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
424       aval += 4; acolidx += 4;
425     }
426 
427     _mm256_storeu_pd(y+i*8,vec_y);
428     _mm256_storeu_pd(y+i*8+4,vec_y2);
429   }
430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
431   for (i=0; i<totalslices; i++) { /* loop over full slices */
432     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
433     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
434 
435     vec_y  = _mm256_setzero_pd();
436     vec_y2 = _mm256_setzero_pd();
437 
438     /* last slice may have padding rows. Don't use vectorization. */
439     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
440       rows_left = A->rmap->n - 8*i;
441       for (r=0; r<rows_left; ++r) {
442         yval = (MatScalar)0;
443         row = 8*i + r;
444         nnz_in_row = a->rlen[row];
445         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
446         y[row] = yval;
447       }
448       break;
449     }
450 
451     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
452     #pragma novector
453     #pragma unroll(2)
454     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
455       vec_vals  = _mm256_loadu_pd(aval);
456       vec_x_tmp = _mm_setzero_pd();
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
460       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
463       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
464       aval     += 4;
465 
466       vec_vals  = _mm256_loadu_pd(aval);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
470       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
471       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
472       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
473       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
474       aval     += 4;
475     }
476 
477     _mm256_storeu_pd(y + i*8,     vec_y);
478     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
479   }
480 #else
481   for (i=0; i<totalslices; i++) { /* loop over slices */
482     for (j=0; j<8; j++) sum[j] = 0.0;
483     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
484       sum[0] += aval[j] * x[acolidx[j]];
485       sum[1] += aval[j+1] * x[acolidx[j+1]];
486       sum[2] += aval[j+2] * x[acolidx[j+2]];
487       sum[3] += aval[j+3] * x[acolidx[j+3]];
488       sum[4] += aval[j+4] * x[acolidx[j+4]];
489       sum[5] += aval[j+5] * x[acolidx[j+5]];
490       sum[6] += aval[j+6] * x[acolidx[j+6]];
491       sum[7] += aval[j+7] * x[acolidx[j+7]];
492     }
493     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
494       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
495     } else {
496       for (j=0; j<8; j++) y[8*i+j] = sum[j];
497     }
498   }
499 #endif
500 
501   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
502   PetscCall(VecRestoreArrayRead(xx,&x));
503   PetscCall(VecRestoreArray(yy,&y));
504   PetscFunctionReturn(0);
505 }
506 
507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
508 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
509 {
510   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
511   PetscScalar       *y,*z;
512   const PetscScalar *x;
513   const MatScalar   *aval=a->val;
514   PetscInt          totalslices=a->totalslices;
515   const PetscInt    *acolidx=a->colidx;
516   PetscInt          i,j;
517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518   __m512d           vec_x,vec_y,vec_vals;
519   __m256i           vec_idx;
520   __mmask8          mask;
521   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
522   __m256i           vec_idx2,vec_idx3,vec_idx4;
523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
524   __m128d           vec_x_tmp;
525   __m256d           vec_x,vec_y,vec_y2,vec_vals;
526   MatScalar         yval;
527   PetscInt          r,row,nnz_in_row;
528 #else
529   PetscScalar       sum[8];
530 #endif
531 
532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
533 #pragma disjoint(*x,*y,*aval)
534 #endif
535 
536   PetscFunctionBegin;
537   PetscCall(VecGetArrayRead(xx,&x));
538   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
540   for (i=0; i<totalslices; i++) { /* loop over slices */
541     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
542     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
543 
544     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
545       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
546       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
547     } else {
548       vec_y  = _mm512_loadu_pd(&y[8*i]);
549     }
550     vec_y2 = _mm512_setzero_pd();
551     vec_y3 = _mm512_setzero_pd();
552     vec_y4 = _mm512_setzero_pd();
553 
554     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
555     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
556     case 3:
557       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
558       acolidx += 8; aval += 8;
559       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
560       acolidx += 8; aval += 8;
561       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
562       acolidx += 8; aval += 8;
563       j += 3;
564       break;
565     case 2:
566       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
567       acolidx += 8; aval += 8;
568       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
569       acolidx += 8; aval += 8;
570       j += 2;
571       break;
572     case 1:
573       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
574       acolidx += 8; aval += 8;
575       j += 1;
576       break;
577     }
578     #pragma novector
579     for (; j<(a->sliidx[i+1]>>3); j+=4) {
580       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
581       acolidx += 8; aval += 8;
582       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
583       acolidx += 8; aval += 8;
584       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
585       acolidx += 8; aval += 8;
586       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
587       acolidx += 8; aval += 8;
588     }
589 
590     vec_y = _mm512_add_pd(vec_y,vec_y2);
591     vec_y = _mm512_add_pd(vec_y,vec_y3);
592     vec_y = _mm512_add_pd(vec_y,vec_y4);
593     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
594       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
595     } else {
596       _mm512_storeu_pd(&z[8*i],vec_y);
597     }
598   }
599 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
600   for (i=0; i<totalslices; i++) { /* loop over full slices */
601     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
602     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
603 
604     /* last slice may have padding rows. Don't use vectorization. */
605     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
606       for (r=0; r<(A->rmap->n & 0x07); ++r) {
607         row        = 8*i + r;
608         yval       = (MatScalar)0.0;
609         nnz_in_row = a->rlen[row];
610         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
611         z[row] = y[row] + yval;
612       }
613       break;
614     }
615 
616     vec_y  = _mm256_loadu_pd(y+8*i);
617     vec_y2 = _mm256_loadu_pd(y+8*i+4);
618 
619     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
620     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
621       vec_vals  = _mm256_loadu_pd(aval);
622       vec_x_tmp = _mm_setzero_pd();
623       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
624       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
625       vec_x     = _mm256_setzero_pd();
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
631       aval     += 4;
632 
633       vec_vals  = _mm256_loadu_pd(aval);
634       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
637       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
640       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
641       aval     += 4;
642     }
643 
644     _mm256_storeu_pd(z+i*8,vec_y);
645     _mm256_storeu_pd(z+i*8+4,vec_y2);
646   }
647 #else
648   for (i=0; i<totalslices; i++) { /* loop over slices */
649     for (j=0; j<8; j++) sum[j] = 0.0;
650     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
651       sum[0] += aval[j] * x[acolidx[j]];
652       sum[1] += aval[j+1] * x[acolidx[j+1]];
653       sum[2] += aval[j+2] * x[acolidx[j+2]];
654       sum[3] += aval[j+3] * x[acolidx[j+3]];
655       sum[4] += aval[j+4] * x[acolidx[j+4]];
656       sum[5] += aval[j+5] * x[acolidx[j+5]];
657       sum[6] += aval[j+6] * x[acolidx[j+6]];
658       sum[7] += aval[j+7] * x[acolidx[j+7]];
659     }
660     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
661       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
662     } else {
663       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
664     }
665   }
666 #endif
667 
668   PetscCall(PetscLogFlops(2.0*a->nz));
669   PetscCall(VecRestoreArrayRead(xx,&x));
670   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
671   PetscFunctionReturn(0);
672 }
673 
674 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
675 {
676   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
677   PetscScalar       *y;
678   const PetscScalar *x;
679   const MatScalar   *aval=a->val;
680   const PetscInt    *acolidx=a->colidx;
681   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
682 
683 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
684 #pragma disjoint(*x,*y,*aval)
685 #endif
686 
687   PetscFunctionBegin;
688   if (A->symmetric == PETSC_BOOL3_TRUE) {
689     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
690     PetscFunctionReturn(0);
691   }
692   if (zz != yy) PetscCall(VecCopy(zz,yy));
693   PetscCall(VecGetArrayRead(xx,&x));
694   PetscCall(VecGetArray(yy,&y));
695   for (i=0; i<a->totalslices; i++) { /* loop over slices */
696     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
697       for (r=0; r<(A->rmap->n & 0x07); ++r) {
698         row        = 8*i + r;
699         nnz_in_row = a->rlen[row];
700         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
701       }
702       break;
703     }
704     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
705       y[acolidx[j]]   += aval[j] * x[8*i];
706       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
707       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
708       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
709       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
710       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
711       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
712       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
713     }
714   }
715   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
716   PetscCall(VecRestoreArrayRead(xx,&x));
717   PetscCall(VecRestoreArray(yy,&y));
718   PetscFunctionReturn(0);
719 }
720 
721 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
722 {
723   PetscFunctionBegin;
724   if (A->symmetric == PETSC_BOOL3_TRUE) {
725     PetscCall(MatMult_SeqSELL(A,xx,yy));
726   } else {
727     PetscCall(VecSet(yy,0.0));
728     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
729   }
730   PetscFunctionReturn(0);
731 }
732 
733 /*
734      Checks for missing diagonals
735 */
736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
737 {
738   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
739   PetscInt       *diag,i;
740 
741   PetscFunctionBegin;
742   *missing = PETSC_FALSE;
743   if (A->rmap->n > 0 && !(a->colidx)) {
744     *missing = PETSC_TRUE;
745     if (d) *d = 0;
746     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
747   } else {
748     diag = a->diag;
749     for (i=0; i<A->rmap->n; i++) {
750       if (diag[i] == -1) {
751         *missing = PETSC_TRUE;
752         if (d) *d = i;
753         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
754         break;
755       }
756     }
757   }
758   PetscFunctionReturn(0);
759 }
760 
761 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
762 {
763   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
764   PetscInt       i,j,m=A->rmap->n,shift;
765 
766   PetscFunctionBegin;
767   if (!a->diag) {
768     PetscCall(PetscMalloc1(m,&a->diag));
769     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
770     a->free_diag = PETSC_TRUE;
771   }
772   for (i=0; i<m; i++) { /* loop over rows */
773     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
774     a->diag[i] = -1;
775     for (j=0; j<a->rlen[i]; j++) {
776       if (a->colidx[shift+j*8] == i) {
777         a->diag[i] = shift+j*8;
778         break;
779       }
780     }
781   }
782   PetscFunctionReturn(0);
783 }
784 
785 /*
786   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
787 */
788 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
789 {
790   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
791   PetscInt       i,*diag,m = A->rmap->n;
792   MatScalar      *val = a->val;
793   PetscScalar    *idiag,*mdiag;
794 
795   PetscFunctionBegin;
796   if (a->idiagvalid) PetscFunctionReturn(0);
797   PetscCall(MatMarkDiagonal_SeqSELL(A));
798   diag = a->diag;
799   if (!a->idiag) {
800     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
801     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
802     val  = a->val;
803   }
804   mdiag = a->mdiag;
805   idiag = a->idiag;
806 
807   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
808     for (i=0; i<m; i++) {
809       mdiag[i] = val[diag[i]];
810       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
811         if (PetscRealPart(fshift)) {
812           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
813           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
814           A->factorerror_zeropivot_value = 0.0;
815           A->factorerror_zeropivot_row   = i;
816         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
817       }
818       idiag[i] = 1.0/val[diag[i]];
819     }
820     PetscCall(PetscLogFlops(m));
821   } else {
822     for (i=0; i<m; i++) {
823       mdiag[i] = val[diag[i]];
824       idiag[i] = omega/(fshift + val[diag[i]]);
825     }
826     PetscCall(PetscLogFlops(2.0*m));
827   }
828   a->idiagvalid = PETSC_TRUE;
829   PetscFunctionReturn(0);
830 }
831 
832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
833 {
834   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
835 
836   PetscFunctionBegin;
837   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
838   PetscCall(MatSeqSELLInvalidateDiagonal(A));
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatDestroy_SeqSELL(Mat A)
843 {
844   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
845 
846   PetscFunctionBegin;
847 #if defined(PETSC_USE_LOG)
848   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
849 #endif
850   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
851   PetscCall(ISDestroy(&a->row));
852   PetscCall(ISDestroy(&a->col));
853   PetscCall(PetscFree(a->diag));
854   PetscCall(PetscFree(a->rlen));
855   PetscCall(PetscFree(a->sliidx));
856   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
857   PetscCall(PetscFree(a->solve_work));
858   PetscCall(ISDestroy(&a->icol));
859   PetscCall(PetscFree(a->saved_values));
860   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
861 
862   PetscCall(PetscFree(A->data));
863 
864   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL));
870   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL));
871   PetscFunctionReturn(0);
872 }
873 
874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
875 {
876   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
877 
878   PetscFunctionBegin;
879   switch (op) {
880   case MAT_ROW_ORIENTED:
881     a->roworiented = flg;
882     break;
883   case MAT_KEEP_NONZERO_PATTERN:
884     a->keepnonzeropattern = flg;
885     break;
886   case MAT_NEW_NONZERO_LOCATIONS:
887     a->nonew = (flg ? 0 : 1);
888     break;
889   case MAT_NEW_NONZERO_LOCATION_ERR:
890     a->nonew = (flg ? -1 : 0);
891     break;
892   case MAT_NEW_NONZERO_ALLOCATION_ERR:
893     a->nonew = (flg ? -2 : 0);
894     break;
895   case MAT_UNUSED_NONZERO_LOCATION_ERR:
896     a->nounused = (flg ? -1 : 0);
897     break;
898   case MAT_FORCE_DIAGONAL_ENTRIES:
899   case MAT_IGNORE_OFF_PROC_ENTRIES:
900   case MAT_USE_HASH_TABLE:
901   case MAT_SORTED_FULL:
902     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
903     break;
904   case MAT_SPD:
905   case MAT_SYMMETRIC:
906   case MAT_STRUCTURALLY_SYMMETRIC:
907   case MAT_HERMITIAN:
908   case MAT_SYMMETRY_ETERNAL:
909   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
910   case MAT_SPD_ETERNAL:
911     /* These options are handled directly by MatSetOption() */
912     break;
913   default:
914     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
915   }
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
920 {
921   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
922   PetscInt       i,j,n,shift;
923   PetscScalar    *x,zero=0.0;
924 
925   PetscFunctionBegin;
926   PetscCall(VecGetLocalSize(v,&n));
927   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
928 
929   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
930     PetscInt *diag=a->diag;
931     PetscCall(VecGetArray(v,&x));
932     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
933     PetscCall(VecRestoreArray(v,&x));
934     PetscFunctionReturn(0);
935   }
936 
937   PetscCall(VecSet(v,zero));
938   PetscCall(VecGetArray(v,&x));
939   for (i=0; i<n; i++) { /* loop over rows */
940     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
941     x[i] = 0;
942     for (j=0; j<a->rlen[i]; j++) {
943       if (a->colidx[shift+j*8] == i) {
944         x[i] = a->val[shift+j*8];
945         break;
946       }
947     }
948   }
949   PetscCall(VecRestoreArray(v,&x));
950   PetscFunctionReturn(0);
951 }
952 
953 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
954 {
955   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
956   const PetscScalar *l,*r;
957   PetscInt          i,j,m,n,row;
958 
959   PetscFunctionBegin;
960   if (ll) {
961     /* The local size is used so that VecMPI can be passed to this routine
962        by MatDiagonalScale_MPISELL */
963     PetscCall(VecGetLocalSize(ll,&m));
964     PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
965     PetscCall(VecGetArrayRead(ll,&l));
966     for (i=0; i<a->totalslices; i++) { /* loop over slices */
967       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
968         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
969           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
970         }
971       } else {
972         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
973           a->val[j] *= l[8*i+row];
974         }
975       }
976     }
977     PetscCall(VecRestoreArrayRead(ll,&l));
978     PetscCall(PetscLogFlops(a->nz));
979   }
980   if (rr) {
981     PetscCall(VecGetLocalSize(rr,&n));
982     PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
983     PetscCall(VecGetArrayRead(rr,&r));
984     for (i=0; i<a->totalslices; i++) { /* loop over slices */
985       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
986         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
987           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
988         }
989       } else {
990         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
991           a->val[j] *= r[a->colidx[j]];
992         }
993       }
994     }
995     PetscCall(VecRestoreArrayRead(rr,&r));
996     PetscCall(PetscLogFlops(a->nz));
997   }
998   PetscCall(MatSeqSELLInvalidateDiagonal(A));
999   PetscFunctionReturn(0);
1000 }
1001 
1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1003 {
1004   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1005   PetscInt    *cp,i,k,low,high,t,row,col,l;
1006   PetscInt    shift;
1007   MatScalar   *vp;
1008 
1009   PetscFunctionBegin;
1010   for (k=0; k<m; k++) { /* loop over requested rows */
1011     row = im[k];
1012     if (row<0) continue;
1013     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1014     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1015     cp = a->colidx+shift; /* pointer to the row */
1016     vp = a->val+shift; /* pointer to the row */
1017     for (l=0; l<n; l++) { /* loop over requested columns */
1018       col = in[l];
1019       if (col<0) continue;
1020       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1021       high = a->rlen[row]; low = 0; /* assume unsorted */
1022       while (high-low > 5) {
1023         t = (low+high)/2;
1024         if (*(cp+t*8) > col) high = t;
1025         else low = t;
1026       }
1027       for (i=low; i<high; i++) {
1028         if (*(cp+8*i) > col) break;
1029         if (*(cp+8*i) == col) {
1030           *v++ = *(vp+8*i);
1031           goto finished;
1032         }
1033       }
1034       *v++ = 0.0;
1035     finished:;
1036     }
1037   }
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1042 {
1043   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1044   PetscInt          i,j,m=A->rmap->n,shift;
1045   const char        *name;
1046   PetscViewerFormat format;
1047 
1048   PetscFunctionBegin;
1049   PetscCall(PetscViewerGetFormat(viewer,&format));
1050   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1051     PetscInt nofinalvalue = 0;
1052     /*
1053     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1054       nofinalvalue = 1;
1055     }
1056     */
1057     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1058     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1059     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1060 #if defined(PETSC_USE_COMPLEX)
1061     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1062 #else
1063     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1064 #endif
1065     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1066 
1067     for (i=0; i<m; i++) {
1068       shift = a->sliidx[i>>3]+(i&0x07);
1069       for (j=0; j<a->rlen[i]; j++) {
1070 #if defined(PETSC_USE_COMPLEX)
1071         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1072 #else
1073         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1074 #endif
1075       }
1076     }
1077     /*
1078     if (nofinalvalue) {
1079 #if defined(PETSC_USE_COMPLEX)
1080       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1081 #else
1082       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1083 #endif
1084     }
1085     */
1086     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1087     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1088     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1089   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1090     PetscFunctionReturn(0);
1091   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1092     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1093     for (i=0; i<m; i++) {
1094       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1095       shift = a->sliidx[i>>3]+(i&0x07);
1096       for (j=0; j<a->rlen[i]; j++) {
1097 #if defined(PETSC_USE_COMPLEX)
1098         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1099           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1100         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1101           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1102         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1103           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1104         }
1105 #else
1106         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1107 #endif
1108       }
1109       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1110     }
1111     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1112   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1113     PetscInt    cnt=0,jcnt;
1114     PetscScalar value;
1115 #if defined(PETSC_USE_COMPLEX)
1116     PetscBool   realonly=PETSC_TRUE;
1117     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1118       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1119         realonly = PETSC_FALSE;
1120         break;
1121       }
1122     }
1123 #endif
1124 
1125     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1126     for (i=0; i<m; i++) {
1127       jcnt = 0;
1128       shift = a->sliidx[i>>3]+(i&0x07);
1129       for (j=0; j<A->cmap->n; j++) {
1130         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1131           value = a->val[cnt++];
1132           jcnt++;
1133         } else {
1134           value = 0.0;
1135         }
1136 #if defined(PETSC_USE_COMPLEX)
1137         if (realonly) {
1138           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1139         } else {
1140           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1141         }
1142 #else
1143         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1144 #endif
1145       }
1146       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1147     }
1148     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1149   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1150     PetscInt fshift=1;
1151     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1152 #if defined(PETSC_USE_COMPLEX)
1153     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1154 #else
1155     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1156 #endif
1157     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1158     for (i=0; i<m; i++) {
1159       shift = a->sliidx[i>>3]+(i&0x07);
1160       for (j=0; j<a->rlen[i]; j++) {
1161 #if defined(PETSC_USE_COMPLEX)
1162         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1163 #else
1164         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1165 #endif
1166       }
1167     }
1168     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1169   } else if (format == PETSC_VIEWER_NATIVE) {
1170     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1171       PetscInt row;
1172       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1173       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1174 #if defined(PETSC_USE_COMPLEX)
1175         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1177         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1178           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1179         } else {
1180           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1181         }
1182 #else
1183         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1184 #endif
1185       }
1186     }
1187   } else {
1188     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1189     if (A->factortype) {
1190       for (i=0; i<m; i++) {
1191         shift = a->sliidx[i>>3]+(i&0x07);
1192         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1193         /* L part */
1194         for (j=shift; j<a->diag[i]; j+=8) {
1195 #if defined(PETSC_USE_COMPLEX)
1196           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1197             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1198           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1199             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1200           } else {
1201             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1202           }
1203 #else
1204           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1205 #endif
1206         }
1207         /* diagonal */
1208         j = a->diag[i];
1209 #if defined(PETSC_USE_COMPLEX)
1210         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1212         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1214         } else {
1215           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1216         }
1217 #else
1218         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1219 #endif
1220 
1221         /* U part */
1222         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1223 #if defined(PETSC_USE_COMPLEX)
1224           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1225             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1226           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1227             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1228           } else {
1229             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1230           }
1231 #else
1232           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1233 #endif
1234         }
1235         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1236       }
1237     } else {
1238       for (i=0; i<m; i++) {
1239         shift = a->sliidx[i>>3]+(i&0x07);
1240         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1241         for (j=0; j<a->rlen[i]; j++) {
1242 #if defined(PETSC_USE_COMPLEX)
1243           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1244             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1245           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1246             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1247           } else {
1248             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1249           }
1250 #else
1251           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1252 #endif
1253         }
1254         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1255       }
1256     }
1257     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1258   }
1259   PetscCall(PetscViewerFlush(viewer));
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 #include <petscdraw.h>
1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1265 {
1266   Mat               A=(Mat)Aa;
1267   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1268   PetscInt          i,j,m=A->rmap->n,shift;
1269   int               color;
1270   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1271   PetscViewer       viewer;
1272   PetscViewerFormat format;
1273 
1274   PetscFunctionBegin;
1275   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1276   PetscCall(PetscViewerGetFormat(viewer,&format));
1277   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1278 
1279   /* loop over matrix elements drawing boxes */
1280 
1281   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1282     PetscDrawCollectiveBegin(draw);
1283     /* Blue for negative, Cyan for zero and  Red for positive */
1284     color = PETSC_DRAW_BLUE;
1285     for (i=0; i<m; i++) {
1286       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1287       y_l = m - i - 1.0; y_r = y_l + 1.0;
1288       for (j=0; j<a->rlen[i]; j++) {
1289         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1290         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1291         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1292       }
1293     }
1294     color = PETSC_DRAW_CYAN;
1295     for (i=0; i<m; i++) {
1296       shift = a->sliidx[i>>3]+(i&0x07);
1297       y_l = m - i - 1.0; y_r = y_l + 1.0;
1298       for (j=0; j<a->rlen[i]; j++) {
1299         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1300         if (a->val[shift+8*j] !=  0.) continue;
1301         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1302       }
1303     }
1304     color = PETSC_DRAW_RED;
1305     for (i=0; i<m; i++) {
1306       shift = a->sliidx[i>>3]+(i&0x07);
1307       y_l = m - i - 1.0; y_r = y_l + 1.0;
1308       for (j=0; j<a->rlen[i]; j++) {
1309         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1310         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1311         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1312       }
1313     }
1314     PetscDrawCollectiveEnd(draw);
1315   } else {
1316     /* use contour shading to indicate magnitude of values */
1317     /* first determine max of all nonzero values */
1318     PetscReal minv=0.0,maxv=0.0;
1319     PetscInt  count=0;
1320     PetscDraw popup;
1321     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1322       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1323     }
1324     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1325     PetscCall(PetscDrawGetPopup(draw,&popup));
1326     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1327 
1328     PetscDrawCollectiveBegin(draw);
1329     for (i=0; i<m; i++) {
1330       shift = a->sliidx[i>>3]+(i&0x07);
1331       y_l = m - i - 1.0;
1332       y_r = y_l + 1.0;
1333       for (j=0; j<a->rlen[i]; j++) {
1334         x_l = a->colidx[shift+j*8];
1335         x_r = x_l + 1.0;
1336         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1337         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1338         count++;
1339       }
1340     }
1341     PetscDrawCollectiveEnd(draw);
1342   }
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #include <petscdraw.h>
1347 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1348 {
1349   PetscDraw      draw;
1350   PetscReal      xr,yr,xl,yl,h,w;
1351   PetscBool      isnull;
1352 
1353   PetscFunctionBegin;
1354   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1355   PetscCall(PetscDrawIsNull(draw,&isnull));
1356   if (isnull) PetscFunctionReturn(0);
1357 
1358   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1359   xr  += w;          yr += h;         xl = -w;     yl = -h;
1360   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1361   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1362   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1363   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1364   PetscCall(PetscDrawSave(draw));
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1369 {
1370   PetscBool      iascii,isbinary,isdraw;
1371 
1372   PetscFunctionBegin;
1373   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1374   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1375   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1376   if (iascii) {
1377     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1378   } else if (isbinary) {
1379     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1380   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A,viewer));
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1385 {
1386   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1387   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1388   MatScalar      *vp;
1389 
1390   PetscFunctionBegin;
1391   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1392   /* To do: compress out the unused elements */
1393   PetscCall(MatMarkDiagonal_SeqSELL(A));
1394   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1395   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1396   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1397   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1398   for (i=0; i<a->totalslices; ++i) {
1399     shift = a->sliidx[i];    /* starting index of the slice */
1400     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1401     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1402     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1403       row  = 8*i + row_in_slice;
1404       nrow = a->rlen[row]; /* number of nonzeros in row */
1405       /*
1406         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1407         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1408       */
1409       lastcol = 0;
1410       if (nrow>0) { /* nonempty row */
1411         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1412       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1413         for (j=1;j<8;j++) {
1414           if (a->rlen[8*i+j]) {
1415             lastcol = cp[j];
1416             break;
1417           }
1418         }
1419       } else {
1420         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1421       }
1422 
1423       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1424         cp[8*k+row_in_slice] = lastcol;
1425         vp[8*k+row_in_slice] = (MatScalar)0;
1426       }
1427     }
1428   }
1429 
1430   A->info.mallocs += a->reallocs;
1431   a->reallocs      = 0;
1432 
1433   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1434   PetscFunctionReturn(0);
1435 }
1436 
1437 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1438 {
1439   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1440 
1441   PetscFunctionBegin;
1442   info->block_size   = 1.0;
1443   info->nz_allocated = a->maxallocmat;
1444   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1445   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1446   info->assemblies   = A->num_ass;
1447   info->mallocs      = A->info.mallocs;
1448   info->memory       = ((PetscObject)A)->mem;
1449   if (A->factortype) {
1450     info->fill_ratio_given  = A->info.fill_ratio_given;
1451     info->fill_ratio_needed = A->info.fill_ratio_needed;
1452     info->factor_mallocs    = A->info.factor_mallocs;
1453   } else {
1454     info->fill_ratio_given  = 0;
1455     info->fill_ratio_needed = 0;
1456     info->factor_mallocs    = 0;
1457   }
1458   PetscFunctionReturn(0);
1459 }
1460 
1461 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1462 {
1463   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1464   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1465   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1466   MatScalar      *vp,value;
1467 
1468   PetscFunctionBegin;
1469   for (k=0; k<m; k++) { /* loop over added rows */
1470     row = im[k];
1471     if (row < 0) continue;
1472     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1473     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1474     cp    = a->colidx+shift; /* pointer to the row */
1475     vp    = a->val+shift; /* pointer to the row */
1476     nrow  = a->rlen[row];
1477     low   = 0;
1478     high  = nrow;
1479 
1480     for (l=0; l<n; l++) { /* loop over added columns */
1481       col = in[l];
1482       if (col<0) continue;
1483       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1484       if (a->roworiented) {
1485         value = v[l+k*n];
1486       } else {
1487         value = v[k+l*m];
1488       }
1489       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1490 
1491       /* search in this row for the specified column, i indicates the column to be set */
1492       if (col <= lastcol) low = 0;
1493       else high = nrow;
1494       lastcol = col;
1495       while (high-low > 5) {
1496         t = (low+high)/2;
1497         if (*(cp+t*8) > col) high = t;
1498         else low = t;
1499       }
1500       for (i=low; i<high; i++) {
1501         if (*(cp+i*8) > col) break;
1502         if (*(cp+i*8) == col) {
1503           if (is == ADD_VALUES) *(vp+i*8) += value;
1504           else *(vp+i*8) = value;
1505           low = i + 1;
1506           goto noinsert;
1507         }
1508       }
1509       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1510       if (nonew == 1) goto noinsert;
1511       PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1512       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1513       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1514       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1515       for (ii=nrow-1; ii>=i; ii--) {
1516         *(cp+(ii+1)*8) = *(cp+ii*8);
1517         *(vp+(ii+1)*8) = *(vp+ii*8);
1518       }
1519       a->rlen[row]++;
1520       *(cp+i*8) = col;
1521       *(vp+i*8) = value;
1522       a->nz++;
1523       A->nonzerostate++;
1524       low = i+1; high++; nrow++;
1525 noinsert:;
1526     }
1527     a->rlen[row] = nrow;
1528   }
1529   PetscFunctionReturn(0);
1530 }
1531 
1532 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1533 {
1534   PetscFunctionBegin;
1535   /* If the two matrices have the same copy implementation, use fast copy. */
1536   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1537     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1538     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1539 
1540     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1541     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1542   } else {
1543     PetscCall(MatCopy_Basic(A,B,str));
1544   }
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1549 {
1550   PetscFunctionBegin;
1551   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1552   PetscFunctionReturn(0);
1553 }
1554 
1555 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1556 {
1557   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1558 
1559   PetscFunctionBegin;
1560   *array = a->val;
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1565 {
1566   PetscFunctionBegin;
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1571 {
1572   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1573   PetscInt    i;
1574   MatScalar   *aval=a->val;
1575 
1576   PetscFunctionBegin;
1577   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1582 {
1583   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1584   PetscInt       i;
1585   MatScalar      *aval=a->val;
1586 
1587   PetscFunctionBegin;
1588   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1589   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1594 {
1595   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1596   MatScalar      *aval=a->val;
1597   PetscScalar    oalpha=alpha;
1598   PetscBLASInt   one=1,size;
1599 
1600   PetscFunctionBegin;
1601   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1602   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1603   PetscCall(PetscLogFlops(a->nz));
1604   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1609 {
1610   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1611 
1612   PetscFunctionBegin;
1613   if (!Y->preallocated || !y->nz) {
1614     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1615   }
1616   PetscCall(MatShift_Basic(Y,a));
1617   PetscFunctionReturn(0);
1618 }
1619 
1620 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1621 {
1622   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1623   PetscScalar       *x,sum,*t;
1624   const MatScalar   *idiag=NULL,*mdiag;
1625   const PetscScalar *b,*xb;
1626   PetscInt          n,m=A->rmap->n,i,j,shift;
1627   const PetscInt    *diag;
1628 
1629   PetscFunctionBegin;
1630   its = its*lits;
1631 
1632   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1633   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1634   a->fshift = fshift;
1635   a->omega  = omega;
1636 
1637   diag  = a->diag;
1638   t     = a->ssor_work;
1639   idiag = a->idiag;
1640   mdiag = a->mdiag;
1641 
1642   PetscCall(VecGetArray(xx,&x));
1643   PetscCall(VecGetArrayRead(bb,&b));
1644   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1645   PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1646   PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1647   PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1648 
1649   if (flag & SOR_ZERO_INITIAL_GUESS) {
1650     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1651       for (i=0; i<m; i++) {
1652         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1653         sum   = b[i];
1654         n     = (diag[i]-shift)/8;
1655         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1656         t[i]  = sum;
1657         x[i]  = sum*idiag[i];
1658       }
1659       xb   = t;
1660       PetscCall(PetscLogFlops(a->nz));
1661     } else xb = b;
1662     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1663       for (i=m-1; i>=0; i--) {
1664         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1665         sum   = xb[i];
1666         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1667         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1668         if (xb == b) {
1669           x[i] = sum*idiag[i];
1670         } else {
1671           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1672         }
1673       }
1674       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1675     }
1676     its--;
1677   }
1678   while (its--) {
1679     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1680       for (i=0; i<m; i++) {
1681         /* lower */
1682         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1683         sum   = b[i];
1684         n     = (diag[i]-shift)/8;
1685         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1686         t[i]  = sum;             /* save application of the lower-triangular part */
1687         /* upper */
1688         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1689         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1690         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1691       }
1692       xb   = t;
1693       PetscCall(PetscLogFlops(2.0*a->nz));
1694     } else xb = b;
1695     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1696       for (i=m-1; i>=0; i--) {
1697         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1698         sum = xb[i];
1699         if (xb == b) {
1700           /* whole matrix (no checkpointing available) */
1701           n     = a->rlen[i];
1702           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1703           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1704         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1705           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1706           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1707           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1708         }
1709       }
1710       if (xb == b) {
1711         PetscCall(PetscLogFlops(2.0*a->nz));
1712       } else {
1713         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1714       }
1715     }
1716   }
1717   PetscCall(VecRestoreArray(xx,&x));
1718   PetscCall(VecRestoreArrayRead(bb,&b));
1719   PetscFunctionReturn(0);
1720 }
1721 
1722 /* -------------------------------------------------------------------*/
1723 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1724                                        MatGetRow_SeqSELL,
1725                                        MatRestoreRow_SeqSELL,
1726                                        MatMult_SeqSELL,
1727                                /* 4*/  MatMultAdd_SeqSELL,
1728                                        MatMultTranspose_SeqSELL,
1729                                        MatMultTransposeAdd_SeqSELL,
1730                                        NULL,
1731                                        NULL,
1732                                        NULL,
1733                                /* 10*/ NULL,
1734                                        NULL,
1735                                        NULL,
1736                                        MatSOR_SeqSELL,
1737                                        NULL,
1738                                /* 15*/ MatGetInfo_SeqSELL,
1739                                        MatEqual_SeqSELL,
1740                                        MatGetDiagonal_SeqSELL,
1741                                        MatDiagonalScale_SeqSELL,
1742                                        NULL,
1743                                /* 20*/ NULL,
1744                                        MatAssemblyEnd_SeqSELL,
1745                                        MatSetOption_SeqSELL,
1746                                        MatZeroEntries_SeqSELL,
1747                                /* 24*/ NULL,
1748                                        NULL,
1749                                        NULL,
1750                                        NULL,
1751                                        NULL,
1752                                /* 29*/ MatSetUp_SeqSELL,
1753                                        NULL,
1754                                        NULL,
1755                                        NULL,
1756                                        NULL,
1757                                /* 34*/ MatDuplicate_SeqSELL,
1758                                        NULL,
1759                                        NULL,
1760                                        NULL,
1761                                        NULL,
1762                                /* 39*/ NULL,
1763                                        NULL,
1764                                        NULL,
1765                                        MatGetValues_SeqSELL,
1766                                        MatCopy_SeqSELL,
1767                                /* 44*/ NULL,
1768                                        MatScale_SeqSELL,
1769                                        MatShift_SeqSELL,
1770                                        NULL,
1771                                        NULL,
1772                                /* 49*/ NULL,
1773                                        NULL,
1774                                        NULL,
1775                                        NULL,
1776                                        NULL,
1777                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1778                                        NULL,
1779                                        NULL,
1780                                        NULL,
1781                                        NULL,
1782                                /* 59*/ NULL,
1783                                        MatDestroy_SeqSELL,
1784                                        MatView_SeqSELL,
1785                                        NULL,
1786                                        NULL,
1787                                /* 64*/ NULL,
1788                                        NULL,
1789                                        NULL,
1790                                        NULL,
1791                                        NULL,
1792                                /* 69*/ NULL,
1793                                        NULL,
1794                                        NULL,
1795                                        NULL,
1796                                        NULL,
1797                                /* 74*/ NULL,
1798                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1799                                        NULL,
1800                                        NULL,
1801                                        NULL,
1802                                /* 79*/ NULL,
1803                                        NULL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                /* 84*/ NULL,
1808                                        NULL,
1809                                        NULL,
1810                                        NULL,
1811                                        NULL,
1812                                /* 89*/ NULL,
1813                                        NULL,
1814                                        NULL,
1815                                        NULL,
1816                                        NULL,
1817                                /* 94*/ NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                /* 99*/ NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        MatConjugate_SeqSELL,
1826                                        NULL,
1827                                /*104*/ NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                /*109*/ NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        MatMissingDiagonal_SeqSELL,
1837                                /*114*/ NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        NULL,
1842                                /*119*/ NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                /*124*/ NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                /*129*/ NULL,
1853                                        NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                /*134*/ NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                /*139*/ NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        MatFDColoringSetUp_SeqXAIJ,
1866                                        NULL,
1867                                /*144*/ NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL
1873 };
1874 
1875 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1876 {
1877   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1878 
1879   PetscFunctionBegin;
1880   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1881 
1882   /* allocate space for values if not already there */
1883   if (!a->saved_values) {
1884     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1885     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1886   }
1887 
1888   /* copy values over */
1889   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1894 {
1895   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1896 
1897   PetscFunctionBegin;
1898   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1899   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1900   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1901   PetscFunctionReturn(0);
1902 }
1903 
1904 /*@C
1905  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1906 
1907  Not Collective
1908 
1909  Input Parameters:
1910  .  mat - a MATSEQSELL matrix
1911  .  array - pointer to the data
1912 
1913  Level: intermediate
1914 
1915  .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1916  @*/
1917 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1918 {
1919   PetscFunctionBegin;
1920   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1921   PetscFunctionReturn(0);
1922 }
1923 
1924 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1925 {
1926   Mat_SeqSELL    *b;
1927   PetscMPIInt    size;
1928 
1929   PetscFunctionBegin;
1930   PetscCall(PetscCitationsRegister(citation,&cited));
1931   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1932   PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1933 
1934   PetscCall(PetscNewLog(B,&b));
1935 
1936   B->data = (void*)b;
1937 
1938   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1939 
1940   b->row                = NULL;
1941   b->col                = NULL;
1942   b->icol               = NULL;
1943   b->reallocs           = 0;
1944   b->ignorezeroentries  = PETSC_FALSE;
1945   b->roworiented        = PETSC_TRUE;
1946   b->nonew              = 0;
1947   b->diag               = NULL;
1948   b->solve_work         = NULL;
1949   B->spptr              = NULL;
1950   b->saved_values       = NULL;
1951   b->idiag              = NULL;
1952   b->mdiag              = NULL;
1953   b->ssor_work          = NULL;
1954   b->omega              = 1.0;
1955   b->fshift             = 0.0;
1956   b->idiagvalid         = PETSC_FALSE;
1957   b->keepnonzeropattern = PETSC_FALSE;
1958 
1959   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1960   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1961   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1962   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1963   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1964   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1965   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1966   PetscFunctionReturn(0);
1967 }
1968 
1969 /*
1970  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1971  */
1972 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1973 {
1974   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1975   PetscInt       i,m=A->rmap->n;
1976   PetscInt       totalslices=a->totalslices;
1977 
1978   PetscFunctionBegin;
1979   C->factortype = A->factortype;
1980   c->row        = NULL;
1981   c->col        = NULL;
1982   c->icol       = NULL;
1983   c->reallocs   = 0;
1984   C->assembled = PETSC_TRUE;
1985 
1986   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1987   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1988 
1989   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1990   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1991   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1992   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1993 
1994   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1995   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1996 
1997   /* allocate the matrix space */
1998   if (mallocmatspace) {
1999     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
2000     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
2001 
2002     c->singlemalloc = PETSC_TRUE;
2003 
2004     if (m > 0) {
2005       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2006       if (cpvalues == MAT_COPY_VALUES) {
2007         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2008       } else {
2009         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2010       }
2011     }
2012   }
2013 
2014   c->ignorezeroentries = a->ignorezeroentries;
2015   c->roworiented       = a->roworiented;
2016   c->nonew             = a->nonew;
2017   if (a->diag) {
2018     PetscCall(PetscMalloc1(m,&c->diag));
2019     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2020     for (i=0; i<m; i++) {
2021       c->diag[i] = a->diag[i];
2022     }
2023   } else c->diag = NULL;
2024 
2025   c->solve_work         = NULL;
2026   c->saved_values       = NULL;
2027   c->idiag              = NULL;
2028   c->ssor_work          = NULL;
2029   c->keepnonzeropattern = a->keepnonzeropattern;
2030   c->free_val           = PETSC_TRUE;
2031   c->free_colidx        = PETSC_TRUE;
2032 
2033   c->maxallocmat  = a->maxallocmat;
2034   c->maxallocrow  = a->maxallocrow;
2035   c->rlenmax      = a->rlenmax;
2036   c->nz           = a->nz;
2037   C->preallocated = PETSC_TRUE;
2038 
2039   c->nonzerorowcnt = a->nonzerorowcnt;
2040   C->nonzerostate  = A->nonzerostate;
2041 
2042   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2043   PetscFunctionReturn(0);
2044 }
2045 
2046 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2047 {
2048   PetscFunctionBegin;
2049   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2050   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2051   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2052     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2053   }
2054   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2055   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2056   PetscFunctionReturn(0);
2057 }
2058 
2059 /*MC
2060    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2061    based on the sliced Ellpack format
2062 
2063    Options Database Keys:
2064 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2065 
2066    Level: beginner
2067 
2068 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2069 M*/
2070 
2071 /*MC
2072    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2073 
2074    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2075    and MATMPISELL otherwise.  As a result, for single process communicators,
2076   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2077   for communicators controlling multiple processes.  It is recommended that you call both of
2078   the above preallocation routines for simplicity.
2079 
2080    Options Database Keys:
2081 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2082 
2083   Level: beginner
2084 
2085   Notes:
2086    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2087 
2088    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2089    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2090 
2091   Developer Notes:
2092    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2093 
2094    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2095 .vb
2096                             (2 0  3 4)
2097    Consider the matrix A =  (5 0  6 0)
2098                             (0 0  7 8)
2099                             (0 0  9 9)
2100 
2101    symbolically the Ellpack format can be written as
2102 
2103         (2 3 4 |)           (0 2 3 |)
2104    v =  (5 6 0 |)  colidx = (0 2 2 |)
2105         --------            ---------
2106         (7 8 |)             (2 3 |)
2107         (9 9 |)             (2 3 |)
2108 
2109     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2110     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2111     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2112 
2113     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2114 
2115 .ve
2116 
2117       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2118 
2119  References:
2120 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2121    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2122 
2123 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2124 M*/
2125 
2126 /*@C
2127        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2128 
2129  Collective on comm
2130 
2131  Input Parameters:
2132 +  comm - MPI communicator, set to PETSC_COMM_SELF
2133 .  m - number of rows
2134 .  n - number of columns
2135 .  rlenmax - maximum number of nonzeros in a row
2136 -  rlen - array containing the number of nonzeros in the various rows
2137  (possibly different for each row) or NULL
2138 
2139  Output Parameter:
2140 .  A - the matrix
2141 
2142  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2143  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2144  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2145 
2146  Notes:
2147  If nnz is given then nz is ignored
2148 
2149  Specify the preallocated storage with either rlenmax or rlen (not both).
2150  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2151  allocation.  For large problems you MUST preallocate memory or you
2152  will get TERRIBLE performance, see the users' manual chapter on matrices.
2153 
2154  Level: intermediate
2155 
2156  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2157 
2158  @*/
2159 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2160 {
2161   PetscFunctionBegin;
2162   PetscCall(MatCreate(comm,A));
2163   PetscCall(MatSetSizes(*A,m,n,m,n));
2164   PetscCall(MatSetType(*A,MATSEQSELL));
2165   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2166   PetscFunctionReturn(0);
2167 }
2168 
2169 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2170 {
2171   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2172   PetscInt       totalslices=a->totalslices;
2173 
2174   PetscFunctionBegin;
2175   /* If the  matrix dimensions are not equal,or no of nonzeros */
2176   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2177     *flg = PETSC_FALSE;
2178     PetscFunctionReturn(0);
2179   }
2180   /* if the a->colidx are the same */
2181   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2182   if (!*flg) PetscFunctionReturn(0);
2183   /* if a->val are the same */
2184   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2185   PetscFunctionReturn(0);
2186 }
2187 
2188 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2189 {
2190   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2191 
2192   PetscFunctionBegin;
2193   a->idiagvalid  = PETSC_FALSE;
2194   PetscFunctionReturn(0);
2195 }
2196 
2197 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2198 {
2199 #if defined(PETSC_USE_COMPLEX)
2200   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2201   PetscInt    i;
2202   PetscScalar *val = a->val;
2203 
2204   PetscFunctionBegin;
2205   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2206     val[i] = PetscConj(val[i]);
2207   }
2208 #else
2209   PetscFunctionBegin;
2210 #endif
2211   PetscFunctionReturn(0);
2212 }
2213