xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) {
179     PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
180   }
181   PetscFunctionReturn(0);
182 }
183 
184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
185 {
186   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
187   PetscInt    shift;
188 
189   PetscFunctionBegin;
190   PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
191   if (nz) *nz = a->rlen[row];
192   shift = a->sliidx[row>>3]+(row&0x07);
193   if (!a->getrowcols) {
194 
195     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
196   }
197   if (idx) {
198     PetscInt j;
199     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
200     *idx = a->getrowcols;
201   }
202   if (v) {
203     PetscInt j;
204     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
205     *v = a->getrowvals;
206   }
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
211 {
212   PetscFunctionBegin;
213   PetscFunctionReturn(0);
214 }
215 
216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
217 {
218   Mat            B;
219   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (reuse == MAT_REUSE_MATRIX) {
224     B    = *newmat;
225     PetscCall(MatZeroEntries(B));
226   } else {
227     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
228     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
229     PetscCall(MatSetType(B,MATSEQAIJ));
230     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
231   }
232 
233   for (i=0; i<A->rmap->n; i++) {
234     PetscInt    nz = 0,*cols = NULL;
235     PetscScalar *vals = NULL;
236 
237     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
238     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
239     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
240   }
241 
242   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
243   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
244   B->rmap->bs = A->rmap->bs;
245 
246   if (reuse == MAT_INPLACE_MATRIX) {
247     PetscCall(MatHeaderReplace(A,&B));
248   } else {
249     *newmat = B;
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 #include <../src/mat/impls/aij/seq/aij.h>
255 
256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
257 {
258   Mat               B;
259   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
260   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
261   const PetscInt    *cols;
262   const PetscScalar *vals;
263 
264   PetscFunctionBegin;
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     if (PetscDefined(USE_DEBUG) || !a->ilen) {
270       PetscCall(PetscMalloc1(m,&rowlengths));
271       for (i=0; i<m; i++) {
272         rowlengths[i] = ai[i+1] - ai[i];
273       }
274     }
275     if (PetscDefined(USE_DEBUG) && a->ilen) {
276       PetscBool eq;
277       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
278       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
279       PetscCall(PetscFree(rowlengths));
280       rowlengths = a->ilen;
281     } else if (a->ilen) rowlengths = a->ilen;
282     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
283     PetscCall(MatSetSizes(B,m,n,m,n));
284     PetscCall(MatSetType(B,MATSEQSELL));
285     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
286     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
287   }
288 
289   for (row=0; row<m; row++) {
290     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
292     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
293   }
294   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
295   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
296   B->rmap->bs = A->rmap->bs;
297 
298   if (reuse == MAT_INPLACE_MATRIX) {
299     PetscCall(MatHeaderReplace(A,&B));
300   } else {
301     *newmat = B;
302   }
303   PetscFunctionReturn(0);
304 }
305 
306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
309   PetscScalar       *y;
310   const PetscScalar *x;
311   const MatScalar   *aval=a->val;
312   PetscInt          totalslices=a->totalslices;
313   const PetscInt    *acolidx=a->colidx;
314   PetscInt          i,j;
315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m512d           vec_x,vec_y,vec_vals;
317   __m256i           vec_idx;
318   __mmask8          mask;
319   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
320   __m256i           vec_idx2,vec_idx3,vec_idx4;
321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322   __m128i           vec_idx;
323   __m256d           vec_x,vec_y,vec_y2,vec_vals;
324   MatScalar         yval;
325   PetscInt          r,rows_left,row,nnz_in_row;
326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
327   __m128d           vec_x_tmp;
328   __m256d           vec_x,vec_y,vec_y2,vec_vals;
329   MatScalar         yval;
330   PetscInt          r,rows_left,row,nnz_in_row;
331 #else
332   PetscScalar       sum[8];
333 #endif
334 
335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
336 #pragma disjoint(*x,*y,*aval)
337 #endif
338 
339   PetscFunctionBegin;
340   PetscCall(VecGetArrayRead(xx,&x));
341   PetscCall(VecGetArray(yy,&y));
342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
343   for (i=0; i<totalslices; i++) { /* loop over slices */
344     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
345     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
346 
347     vec_y  = _mm512_setzero_pd();
348     vec_y2 = _mm512_setzero_pd();
349     vec_y3 = _mm512_setzero_pd();
350     vec_y4 = _mm512_setzero_pd();
351 
352     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
353     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
354     case 3:
355       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
360       acolidx += 8; aval += 8;
361       j += 3;
362       break;
363     case 2:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
367       acolidx += 8; aval += 8;
368       j += 2;
369       break;
370     case 1:
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       j += 1;
374       break;
375     }
376     #pragma novector
377     for (; j<(a->sliidx[i+1]>>3); j+=4) {
378       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
383       acolidx += 8; aval += 8;
384       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
385       acolidx += 8; aval += 8;
386     }
387 
388     vec_y = _mm512_add_pd(vec_y,vec_y2);
389     vec_y = _mm512_add_pd(vec_y,vec_y3);
390     vec_y = _mm512_add_pd(vec_y,vec_y4);
391     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
392       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
393       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
394     } else {
395       _mm512_storeu_pd(&y[8*i],vec_y);
396     }
397   }
398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
399   for (i=0; i<totalslices; i++) { /* loop over full slices */
400     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
401     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
402 
403     /* last slice may have padding rows. Don't use vectorization. */
404     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
405       rows_left = A->rmap->n - 8*i;
406       for (r=0; r<rows_left; ++r) {
407         yval = (MatScalar)0;
408         row = 8*i + r;
409         nnz_in_row = a->rlen[row];
410         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
411         y[row] = yval;
412       }
413       break;
414     }
415 
416     vec_y  = _mm256_setzero_pd();
417     vec_y2 = _mm256_setzero_pd();
418 
419     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
420     #pragma novector
421     #pragma unroll(2)
422     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
424       aval += 4; acolidx += 4;
425       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
426       aval += 4; acolidx += 4;
427     }
428 
429     _mm256_storeu_pd(y+i*8,vec_y);
430     _mm256_storeu_pd(y+i*8+4,vec_y2);
431   }
432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433   for (i=0; i<totalslices; i++) { /* loop over full slices */
434     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
435     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
436 
437     vec_y  = _mm256_setzero_pd();
438     vec_y2 = _mm256_setzero_pd();
439 
440     /* last slice may have padding rows. Don't use vectorization. */
441     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
442       rows_left = A->rmap->n - 8*i;
443       for (r=0; r<rows_left; ++r) {
444         yval = (MatScalar)0;
445         row = 8*i + r;
446         nnz_in_row = a->rlen[row];
447         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
448         y[row] = yval;
449       }
450       break;
451     }
452 
453     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
454     #pragma novector
455     #pragma unroll(2)
456     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_setzero_pd();
459       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
460       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
461       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
462       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
463       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
464       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
465       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
466       aval     += 4;
467 
468       vec_vals  = _mm256_loadu_pd(aval);
469       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
470       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
471       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
472       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
473       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
474       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
475       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
476       aval     += 4;
477     }
478 
479     _mm256_storeu_pd(y + i*8,     vec_y);
480     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
481   }
482 #else
483   for (i=0; i<totalslices; i++) { /* loop over slices */
484     for (j=0; j<8; j++) sum[j] = 0.0;
485     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
486       sum[0] += aval[j] * x[acolidx[j]];
487       sum[1] += aval[j+1] * x[acolidx[j+1]];
488       sum[2] += aval[j+2] * x[acolidx[j+2]];
489       sum[3] += aval[j+3] * x[acolidx[j+3]];
490       sum[4] += aval[j+4] * x[acolidx[j+4]];
491       sum[5] += aval[j+5] * x[acolidx[j+5]];
492       sum[6] += aval[j+6] * x[acolidx[j+6]];
493       sum[7] += aval[j+7] * x[acolidx[j+7]];
494     }
495     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
496       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
497     } else {
498       for (j=0; j<8; j++) y[8*i+j] = sum[j];
499     }
500   }
501 #endif
502 
503   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
504   PetscCall(VecRestoreArrayRead(xx,&x));
505   PetscCall(VecRestoreArray(yy,&y));
506   PetscFunctionReturn(0);
507 }
508 
509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
510 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
511 {
512   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
513   PetscScalar       *y,*z;
514   const PetscScalar *x;
515   const MatScalar   *aval=a->val;
516   PetscInt          totalslices=a->totalslices;
517   const PetscInt    *acolidx=a->colidx;
518   PetscInt          i,j;
519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m512d           vec_x,vec_y,vec_vals;
521   __m256i           vec_idx;
522   __mmask8          mask;
523   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
524   __m256i           vec_idx2,vec_idx3,vec_idx4;
525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
526   __m128d           vec_x_tmp;
527   __m256d           vec_x,vec_y,vec_y2,vec_vals;
528   MatScalar         yval;
529   PetscInt          r,row,nnz_in_row;
530 #else
531   PetscScalar       sum[8];
532 #endif
533 
534 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
535 #pragma disjoint(*x,*y,*aval)
536 #endif
537 
538   PetscFunctionBegin;
539   PetscCall(VecGetArrayRead(xx,&x));
540   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
542   for (i=0; i<totalslices; i++) { /* loop over slices */
543     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
544     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
545 
546     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
547       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
548       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
549     } else {
550       vec_y  = _mm512_loadu_pd(&y[8*i]);
551     }
552     vec_y2 = _mm512_setzero_pd();
553     vec_y3 = _mm512_setzero_pd();
554     vec_y4 = _mm512_setzero_pd();
555 
556     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
557     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
558     case 3:
559       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
560       acolidx += 8; aval += 8;
561       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
562       acolidx += 8; aval += 8;
563       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
564       acolidx += 8; aval += 8;
565       j += 3;
566       break;
567     case 2:
568       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
569       acolidx += 8; aval += 8;
570       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
571       acolidx += 8; aval += 8;
572       j += 2;
573       break;
574     case 1:
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       j += 1;
578       break;
579     }
580     #pragma novector
581     for (; j<(a->sliidx[i+1]>>3); j+=4) {
582       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
583       acolidx += 8; aval += 8;
584       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
585       acolidx += 8; aval += 8;
586       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
587       acolidx += 8; aval += 8;
588       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
589       acolidx += 8; aval += 8;
590     }
591 
592     vec_y = _mm512_add_pd(vec_y,vec_y2);
593     vec_y = _mm512_add_pd(vec_y,vec_y3);
594     vec_y = _mm512_add_pd(vec_y,vec_y4);
595     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
596       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
597     } else {
598       _mm512_storeu_pd(&z[8*i],vec_y);
599     }
600   }
601 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
602   for (i=0; i<totalslices; i++) { /* loop over full slices */
603     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
604     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
605 
606     /* last slice may have padding rows. Don't use vectorization. */
607     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
608       for (r=0; r<(A->rmap->n & 0x07); ++r) {
609         row        = 8*i + r;
610         yval       = (MatScalar)0.0;
611         nnz_in_row = a->rlen[row];
612         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
613         z[row] = y[row] + yval;
614       }
615       break;
616     }
617 
618     vec_y  = _mm256_loadu_pd(y+8*i);
619     vec_y2 = _mm256_loadu_pd(y+8*i+4);
620 
621     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
622     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_setzero_pd();
625       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627       vec_x     = _mm256_setzero_pd();
628       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
629       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
632       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
633       aval     += 4;
634 
635       vec_vals  = _mm256_loadu_pd(aval);
636       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
639       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
642       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
643       aval     += 4;
644     }
645 
646     _mm256_storeu_pd(z+i*8,vec_y);
647     _mm256_storeu_pd(z+i*8+4,vec_y2);
648   }
649 #else
650   for (i=0; i<totalslices; i++) { /* loop over slices */
651     for (j=0; j<8; j++) sum[j] = 0.0;
652     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
653       sum[0] += aval[j] * x[acolidx[j]];
654       sum[1] += aval[j+1] * x[acolidx[j+1]];
655       sum[2] += aval[j+2] * x[acolidx[j+2]];
656       sum[3] += aval[j+3] * x[acolidx[j+3]];
657       sum[4] += aval[j+4] * x[acolidx[j+4]];
658       sum[5] += aval[j+5] * x[acolidx[j+5]];
659       sum[6] += aval[j+6] * x[acolidx[j+6]];
660       sum[7] += aval[j+7] * x[acolidx[j+7]];
661     }
662     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
663       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
664     } else {
665       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
666     }
667   }
668 #endif
669 
670   PetscCall(PetscLogFlops(2.0*a->nz));
671   PetscCall(VecRestoreArrayRead(xx,&x));
672   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
673   PetscFunctionReturn(0);
674 }
675 
676 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
677 {
678   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
679   PetscScalar       *y;
680   const PetscScalar *x;
681   const MatScalar   *aval=a->val;
682   const PetscInt    *acolidx=a->colidx;
683   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
684 
685 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
686 #pragma disjoint(*x,*y,*aval)
687 #endif
688 
689   PetscFunctionBegin;
690   if (A->symmetric) {
691     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
692     PetscFunctionReturn(0);
693   }
694   if (zz != yy) PetscCall(VecCopy(zz,yy));
695   PetscCall(VecGetArrayRead(xx,&x));
696   PetscCall(VecGetArray(yy,&y));
697   for (i=0; i<a->totalslices; i++) { /* loop over slices */
698     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
699       for (r=0; r<(A->rmap->n & 0x07); ++r) {
700         row        = 8*i + r;
701         nnz_in_row = a->rlen[row];
702         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
703       }
704       break;
705     }
706     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
707       y[acolidx[j]]   += aval[j] * x[8*i];
708       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
709       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
710       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
711       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
712       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
713       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
714       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
715     }
716   }
717   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
718   PetscCall(VecRestoreArrayRead(xx,&x));
719   PetscCall(VecRestoreArray(yy,&y));
720   PetscFunctionReturn(0);
721 }
722 
723 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
724 {
725   PetscFunctionBegin;
726   if (A->symmetric) {
727     PetscCall(MatMult_SeqSELL(A,xx,yy));
728   } else {
729     PetscCall(VecSet(yy,0.0));
730     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
731   }
732   PetscFunctionReturn(0);
733 }
734 
735 /*
736      Checks for missing diagonals
737 */
738 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
739 {
740   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
741   PetscInt       *diag,i;
742 
743   PetscFunctionBegin;
744   *missing = PETSC_FALSE;
745   if (A->rmap->n > 0 && !(a->colidx)) {
746     *missing = PETSC_TRUE;
747     if (d) *d = 0;
748     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
749   } else {
750     diag = a->diag;
751     for (i=0; i<A->rmap->n; i++) {
752       if (diag[i] == -1) {
753         *missing = PETSC_TRUE;
754         if (d) *d = i;
755         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
756         break;
757       }
758     }
759   }
760   PetscFunctionReturn(0);
761 }
762 
763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
764 {
765   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
766   PetscInt       i,j,m=A->rmap->n,shift;
767 
768   PetscFunctionBegin;
769   if (!a->diag) {
770     PetscCall(PetscMalloc1(m,&a->diag));
771     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
772     a->free_diag = PETSC_TRUE;
773   }
774   for (i=0; i<m; i++) { /* loop over rows */
775     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
776     a->diag[i] = -1;
777     for (j=0; j<a->rlen[i]; j++) {
778       if (a->colidx[shift+j*8] == i) {
779         a->diag[i] = shift+j*8;
780         break;
781       }
782     }
783   }
784   PetscFunctionReturn(0);
785 }
786 
787 /*
788   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
789 */
790 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
791 {
792   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
793   PetscInt       i,*diag,m = A->rmap->n;
794   MatScalar      *val = a->val;
795   PetscScalar    *idiag,*mdiag;
796 
797   PetscFunctionBegin;
798   if (a->idiagvalid) PetscFunctionReturn(0);
799   PetscCall(MatMarkDiagonal_SeqSELL(A));
800   diag = a->diag;
801   if (!a->idiag) {
802     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
803     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
804     val  = a->val;
805   }
806   mdiag = a->mdiag;
807   idiag = a->idiag;
808 
809   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
810     for (i=0; i<m; i++) {
811       mdiag[i] = val[diag[i]];
812       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
813         if (PetscRealPart(fshift)) {
814           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
815           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
816           A->factorerror_zeropivot_value = 0.0;
817           A->factorerror_zeropivot_row   = i;
818         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
819       }
820       idiag[i] = 1.0/val[diag[i]];
821     }
822     PetscCall(PetscLogFlops(m));
823   } else {
824     for (i=0; i<m; i++) {
825       mdiag[i] = val[diag[i]];
826       idiag[i] = omega/(fshift + val[diag[i]]);
827     }
828     PetscCall(PetscLogFlops(2.0*m));
829   }
830   a->idiagvalid = PETSC_TRUE;
831   PetscFunctionReturn(0);
832 }
833 
834 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
835 {
836   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
837 
838   PetscFunctionBegin;
839   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
840   PetscCall(MatSeqSELLInvalidateDiagonal(A));
841   PetscFunctionReturn(0);
842 }
843 
844 PetscErrorCode MatDestroy_SeqSELL(Mat A)
845 {
846   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
847 
848   PetscFunctionBegin;
849 #if defined(PETSC_USE_LOG)
850   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
851 #endif
852   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
853   PetscCall(ISDestroy(&a->row));
854   PetscCall(ISDestroy(&a->col));
855   PetscCall(PetscFree(a->diag));
856   PetscCall(PetscFree(a->rlen));
857   PetscCall(PetscFree(a->sliidx));
858   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
859   PetscCall(PetscFree(a->solve_work));
860   PetscCall(ISDestroy(&a->icol));
861   PetscCall(PetscFree(a->saved_values));
862   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
863 
864   PetscCall(PetscFree(A->data));
865 
866   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
870   PetscFunctionReturn(0);
871 }
872 
873 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
874 {
875   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
876 
877   PetscFunctionBegin;
878   switch (op) {
879   case MAT_ROW_ORIENTED:
880     a->roworiented = flg;
881     break;
882   case MAT_KEEP_NONZERO_PATTERN:
883     a->keepnonzeropattern = flg;
884     break;
885   case MAT_NEW_NONZERO_LOCATIONS:
886     a->nonew = (flg ? 0 : 1);
887     break;
888   case MAT_NEW_NONZERO_LOCATION_ERR:
889     a->nonew = (flg ? -1 : 0);
890     break;
891   case MAT_NEW_NONZERO_ALLOCATION_ERR:
892     a->nonew = (flg ? -2 : 0);
893     break;
894   case MAT_UNUSED_NONZERO_LOCATION_ERR:
895     a->nounused = (flg ? -1 : 0);
896     break;
897   case MAT_FORCE_DIAGONAL_ENTRIES:
898   case MAT_IGNORE_OFF_PROC_ENTRIES:
899   case MAT_USE_HASH_TABLE:
900   case MAT_SORTED_FULL:
901     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
902     break;
903   case MAT_SPD:
904   case MAT_SYMMETRIC:
905   case MAT_STRUCTURALLY_SYMMETRIC:
906   case MAT_HERMITIAN:
907   case MAT_SYMMETRY_ETERNAL:
908     /* These options are handled directly by MatSetOption() */
909     break;
910   default:
911     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
912   }
913   PetscFunctionReturn(0);
914 }
915 
916 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
917 {
918   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
919   PetscInt       i,j,n,shift;
920   PetscScalar    *x,zero=0.0;
921 
922   PetscFunctionBegin;
923   PetscCall(VecGetLocalSize(v,&n));
924   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
925 
926   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
927     PetscInt *diag=a->diag;
928     PetscCall(VecGetArray(v,&x));
929     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
930     PetscCall(VecRestoreArray(v,&x));
931     PetscFunctionReturn(0);
932   }
933 
934   PetscCall(VecSet(v,zero));
935   PetscCall(VecGetArray(v,&x));
936   for (i=0; i<n; i++) { /* loop over rows */
937     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
938     x[i] = 0;
939     for (j=0; j<a->rlen[i]; j++) {
940       if (a->colidx[shift+j*8] == i) {
941         x[i] = a->val[shift+j*8];
942         break;
943       }
944     }
945   }
946   PetscCall(VecRestoreArray(v,&x));
947   PetscFunctionReturn(0);
948 }
949 
950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
951 {
952   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
953   const PetscScalar *l,*r;
954   PetscInt          i,j,m,n,row;
955 
956   PetscFunctionBegin;
957   if (ll) {
958     /* The local size is used so that VecMPI can be passed to this routine
959        by MatDiagonalScale_MPISELL */
960     PetscCall(VecGetLocalSize(ll,&m));
961     PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
962     PetscCall(VecGetArrayRead(ll,&l));
963     for (i=0; i<a->totalslices; i++) { /* loop over slices */
964       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
965         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
966           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
967         }
968       } else {
969         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
970           a->val[j] *= l[8*i+row];
971         }
972       }
973     }
974     PetscCall(VecRestoreArrayRead(ll,&l));
975     PetscCall(PetscLogFlops(a->nz));
976   }
977   if (rr) {
978     PetscCall(VecGetLocalSize(rr,&n));
979     PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
980     PetscCall(VecGetArrayRead(rr,&r));
981     for (i=0; i<a->totalslices; i++) { /* loop over slices */
982       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
983         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
984           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
985         }
986       } else {
987         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
988           a->val[j] *= r[a->colidx[j]];
989         }
990       }
991     }
992     PetscCall(VecRestoreArrayRead(rr,&r));
993     PetscCall(PetscLogFlops(a->nz));
994   }
995   PetscCall(MatSeqSELLInvalidateDiagonal(A));
996   PetscFunctionReturn(0);
997 }
998 
999 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1000 {
1001   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1002   PetscInt    *cp,i,k,low,high,t,row,col,l;
1003   PetscInt    shift;
1004   MatScalar   *vp;
1005 
1006   PetscFunctionBegin;
1007   for (k=0; k<m; k++) { /* loop over requested rows */
1008     row = im[k];
1009     if (row<0) continue;
1010     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1011     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1012     cp = a->colidx+shift; /* pointer to the row */
1013     vp = a->val+shift; /* pointer to the row */
1014     for (l=0; l<n; l++) { /* loop over requested columns */
1015       col = in[l];
1016       if (col<0) continue;
1017       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1018       high = a->rlen[row]; low = 0; /* assume unsorted */
1019       while (high-low > 5) {
1020         t = (low+high)/2;
1021         if (*(cp+t*8) > col) high = t;
1022         else low = t;
1023       }
1024       for (i=low; i<high; i++) {
1025         if (*(cp+8*i) > col) break;
1026         if (*(cp+8*i) == col) {
1027           *v++ = *(vp+8*i);
1028           goto finished;
1029         }
1030       }
1031       *v++ = 0.0;
1032     finished:;
1033     }
1034   }
1035   PetscFunctionReturn(0);
1036 }
1037 
1038 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1039 {
1040   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1041   PetscInt          i,j,m=A->rmap->n,shift;
1042   const char        *name;
1043   PetscViewerFormat format;
1044 
1045   PetscFunctionBegin;
1046   PetscCall(PetscViewerGetFormat(viewer,&format));
1047   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1048     PetscInt nofinalvalue = 0;
1049     /*
1050     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1051       nofinalvalue = 1;
1052     }
1053     */
1054     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1055     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1056     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1057 #if defined(PETSC_USE_COMPLEX)
1058     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1059 #else
1060     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1061 #endif
1062     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1063 
1064     for (i=0; i<m; i++) {
1065       shift = a->sliidx[i>>3]+(i&0x07);
1066       for (j=0; j<a->rlen[i]; j++) {
1067 #if defined(PETSC_USE_COMPLEX)
1068         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1069 #else
1070         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1071 #endif
1072       }
1073     }
1074     /*
1075     if (nofinalvalue) {
1076 #if defined(PETSC_USE_COMPLEX)
1077       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1078 #else
1079       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1080 #endif
1081     }
1082     */
1083     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1084     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1085     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1086   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1087     PetscFunctionReturn(0);
1088   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1089     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1090     for (i=0; i<m; i++) {
1091       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1092       shift = a->sliidx[i>>3]+(i&0x07);
1093       for (j=0; j<a->rlen[i]; j++) {
1094 #if defined(PETSC_USE_COMPLEX)
1095         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1096           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1097         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1098           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1099         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1100           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1101         }
1102 #else
1103         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1104 #endif
1105       }
1106       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1107     }
1108     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1109   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1110     PetscInt    cnt=0,jcnt;
1111     PetscScalar value;
1112 #if defined(PETSC_USE_COMPLEX)
1113     PetscBool   realonly=PETSC_TRUE;
1114     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1115       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1116         realonly = PETSC_FALSE;
1117         break;
1118       }
1119     }
1120 #endif
1121 
1122     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1123     for (i=0; i<m; i++) {
1124       jcnt = 0;
1125       shift = a->sliidx[i>>3]+(i&0x07);
1126       for (j=0; j<A->cmap->n; j++) {
1127         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1128           value = a->val[cnt++];
1129           jcnt++;
1130         } else {
1131           value = 0.0;
1132         }
1133 #if defined(PETSC_USE_COMPLEX)
1134         if (realonly) {
1135           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1136         } else {
1137           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1138         }
1139 #else
1140         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1141 #endif
1142       }
1143       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1144     }
1145     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1146   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1147     PetscInt fshift=1;
1148     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1149 #if defined(PETSC_USE_COMPLEX)
1150     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1151 #else
1152     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1153 #endif
1154     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1155     for (i=0; i<m; i++) {
1156       shift = a->sliidx[i>>3]+(i&0x07);
1157       for (j=0; j<a->rlen[i]; j++) {
1158 #if defined(PETSC_USE_COMPLEX)
1159         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1160 #else
1161         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1162 #endif
1163       }
1164     }
1165     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1166   } else if (format == PETSC_VIEWER_NATIVE) {
1167     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1168       PetscInt row;
1169       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1170       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1171 #if defined(PETSC_USE_COMPLEX)
1172         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1173           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1174         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1175           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1176         } else {
1177           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1178         }
1179 #else
1180         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1181 #endif
1182       }
1183     }
1184   } else {
1185     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1186     if (A->factortype) {
1187       for (i=0; i<m; i++) {
1188         shift = a->sliidx[i>>3]+(i&0x07);
1189         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1190         /* L part */
1191         for (j=shift; j<a->diag[i]; j+=8) {
1192 #if defined(PETSC_USE_COMPLEX)
1193           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1194             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1195           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1196             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1197           } else {
1198             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1199           }
1200 #else
1201           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1202 #endif
1203         }
1204         /* diagonal */
1205         j = a->diag[i];
1206 #if defined(PETSC_USE_COMPLEX)
1207         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1208           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1209         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1210           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1211         } else {
1212           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1213         }
1214 #else
1215         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1216 #endif
1217 
1218         /* U part */
1219         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1220 #if defined(PETSC_USE_COMPLEX)
1221           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1222             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1223           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1224             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1225           } else {
1226             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1227           }
1228 #else
1229           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1230 #endif
1231         }
1232         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1233       }
1234     } else {
1235       for (i=0; i<m; i++) {
1236         shift = a->sliidx[i>>3]+(i&0x07);
1237         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1238         for (j=0; j<a->rlen[i]; j++) {
1239 #if defined(PETSC_USE_COMPLEX)
1240           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1241             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1242           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1243             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1244           } else {
1245             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1246           }
1247 #else
1248           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1249 #endif
1250         }
1251         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1252       }
1253     }
1254     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1255   }
1256   PetscCall(PetscViewerFlush(viewer));
1257   PetscFunctionReturn(0);
1258 }
1259 
1260 #include <petscdraw.h>
1261 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1262 {
1263   Mat               A=(Mat)Aa;
1264   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1265   PetscInt          i,j,m=A->rmap->n,shift;
1266   int               color;
1267   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1268   PetscViewer       viewer;
1269   PetscViewerFormat format;
1270 
1271   PetscFunctionBegin;
1272   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1273   PetscCall(PetscViewerGetFormat(viewer,&format));
1274   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1275 
1276   /* loop over matrix elements drawing boxes */
1277 
1278   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1279     PetscDrawCollectiveBegin(draw);
1280     /* Blue for negative, Cyan for zero and  Red for positive */
1281     color = PETSC_DRAW_BLUE;
1282     for (i=0; i<m; i++) {
1283       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1284       y_l = m - i - 1.0; y_r = y_l + 1.0;
1285       for (j=0; j<a->rlen[i]; j++) {
1286         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1287         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1288         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1289       }
1290     }
1291     color = PETSC_DRAW_CYAN;
1292     for (i=0; i<m; i++) {
1293       shift = a->sliidx[i>>3]+(i&0x07);
1294       y_l = m - i - 1.0; y_r = y_l + 1.0;
1295       for (j=0; j<a->rlen[i]; j++) {
1296         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1297         if (a->val[shift+8*j] !=  0.) continue;
1298         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1299       }
1300     }
1301     color = PETSC_DRAW_RED;
1302     for (i=0; i<m; i++) {
1303       shift = a->sliidx[i>>3]+(i&0x07);
1304       y_l = m - i - 1.0; y_r = y_l + 1.0;
1305       for (j=0; j<a->rlen[i]; j++) {
1306         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1307         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1308         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1309       }
1310     }
1311     PetscDrawCollectiveEnd(draw);
1312   } else {
1313     /* use contour shading to indicate magnitude of values */
1314     /* first determine max of all nonzero values */
1315     PetscReal minv=0.0,maxv=0.0;
1316     PetscInt  count=0;
1317     PetscDraw popup;
1318     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1319       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1320     }
1321     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1322     PetscCall(PetscDrawGetPopup(draw,&popup));
1323     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1324 
1325     PetscDrawCollectiveBegin(draw);
1326     for (i=0; i<m; i++) {
1327       shift = a->sliidx[i>>3]+(i&0x07);
1328       y_l = m - i - 1.0;
1329       y_r = y_l + 1.0;
1330       for (j=0; j<a->rlen[i]; j++) {
1331         x_l = a->colidx[shift+j*8];
1332         x_r = x_l + 1.0;
1333         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1334         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1335         count++;
1336       }
1337     }
1338     PetscDrawCollectiveEnd(draw);
1339   }
1340   PetscFunctionReturn(0);
1341 }
1342 
1343 #include <petscdraw.h>
1344 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1345 {
1346   PetscDraw      draw;
1347   PetscReal      xr,yr,xl,yl,h,w;
1348   PetscBool      isnull;
1349 
1350   PetscFunctionBegin;
1351   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1352   PetscCall(PetscDrawIsNull(draw,&isnull));
1353   if (isnull) PetscFunctionReturn(0);
1354 
1355   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1356   xr  += w;          yr += h;         xl = -w;     yl = -h;
1357   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1358   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1359   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1360   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1361   PetscCall(PetscDrawSave(draw));
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1366 {
1367   PetscBool      iascii,isbinary,isdraw;
1368 
1369   PetscFunctionBegin;
1370   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1371   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1372   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1373   if (iascii) {
1374     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1375   } else if (isbinary) {
1376     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1377   } else if (isdraw) {
1378     PetscCall(MatView_SeqSELL_Draw(A,viewer));
1379   }
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1384 {
1385   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1386   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1387   MatScalar      *vp;
1388 
1389   PetscFunctionBegin;
1390   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1391   /* To do: compress out the unused elements */
1392   PetscCall(MatMarkDiagonal_SeqSELL(A));
1393   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1394   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1395   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1396   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1397   for (i=0; i<a->totalslices; ++i) {
1398     shift = a->sliidx[i];    /* starting index of the slice */
1399     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1400     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1401     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1402       row  = 8*i + row_in_slice;
1403       nrow = a->rlen[row]; /* number of nonzeros in row */
1404       /*
1405         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1406         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1407       */
1408       lastcol = 0;
1409       if (nrow>0) { /* nonempty row */
1410         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1411       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1412         for (j=1;j<8;j++) {
1413           if (a->rlen[8*i+j]) {
1414             lastcol = cp[j];
1415             break;
1416           }
1417         }
1418       } else {
1419         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1420       }
1421 
1422       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1423         cp[8*k+row_in_slice] = lastcol;
1424         vp[8*k+row_in_slice] = (MatScalar)0;
1425       }
1426     }
1427   }
1428 
1429   A->info.mallocs += a->reallocs;
1430   a->reallocs      = 0;
1431 
1432   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1437 {
1438   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1439 
1440   PetscFunctionBegin;
1441   info->block_size   = 1.0;
1442   info->nz_allocated = a->maxallocmat;
1443   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1444   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1445   info->assemblies   = A->num_ass;
1446   info->mallocs      = A->info.mallocs;
1447   info->memory       = ((PetscObject)A)->mem;
1448   if (A->factortype) {
1449     info->fill_ratio_given  = A->info.fill_ratio_given;
1450     info->fill_ratio_needed = A->info.fill_ratio_needed;
1451     info->factor_mallocs    = A->info.factor_mallocs;
1452   } else {
1453     info->fill_ratio_given  = 0;
1454     info->fill_ratio_needed = 0;
1455     info->factor_mallocs    = 0;
1456   }
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1461 {
1462   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1463   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1464   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1465   MatScalar      *vp,value;
1466 
1467   PetscFunctionBegin;
1468   for (k=0; k<m; k++) { /* loop over added rows */
1469     row = im[k];
1470     if (row < 0) continue;
1471     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1472     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1473     cp    = a->colidx+shift; /* pointer to the row */
1474     vp    = a->val+shift; /* pointer to the row */
1475     nrow  = a->rlen[row];
1476     low   = 0;
1477     high  = nrow;
1478 
1479     for (l=0; l<n; l++) { /* loop over added columns */
1480       col = in[l];
1481       if (col<0) continue;
1482       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1483       if (a->roworiented) {
1484         value = v[l+k*n];
1485       } else {
1486         value = v[k+l*m];
1487       }
1488       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1489 
1490       /* search in this row for the specified column, i indicates the column to be set */
1491       if (col <= lastcol) low = 0;
1492       else high = nrow;
1493       lastcol = col;
1494       while (high-low > 5) {
1495         t = (low+high)/2;
1496         if (*(cp+t*8) > col) high = t;
1497         else low = t;
1498       }
1499       for (i=low; i<high; i++) {
1500         if (*(cp+i*8) > col) break;
1501         if (*(cp+i*8) == col) {
1502           if (is == ADD_VALUES) *(vp+i*8) += value;
1503           else *(vp+i*8) = value;
1504           low = i + 1;
1505           goto noinsert;
1506         }
1507       }
1508       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1509       if (nonew == 1) goto noinsert;
1510       PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1511       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1512       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1513       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1514       for (ii=nrow-1; ii>=i; ii--) {
1515         *(cp+(ii+1)*8) = *(cp+ii*8);
1516         *(vp+(ii+1)*8) = *(vp+ii*8);
1517       }
1518       a->rlen[row]++;
1519       *(cp+i*8) = col;
1520       *(vp+i*8) = value;
1521       a->nz++;
1522       A->nonzerostate++;
1523       low = i+1; high++; nrow++;
1524 noinsert:;
1525     }
1526     a->rlen[row] = nrow;
1527   }
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1532 {
1533   PetscFunctionBegin;
1534   /* If the two matrices have the same copy implementation, use fast copy. */
1535   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1536     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1537     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1538 
1539     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1540     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1541   } else {
1542     PetscCall(MatCopy_Basic(A,B,str));
1543   }
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1548 {
1549   PetscFunctionBegin;
1550   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1555 {
1556   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1557 
1558   PetscFunctionBegin;
1559   *array = a->val;
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1564 {
1565   PetscFunctionBegin;
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1570 {
1571   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1572   PetscInt    i;
1573   MatScalar   *aval=a->val;
1574 
1575   PetscFunctionBegin;
1576   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1581 {
1582   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1583   PetscInt       i;
1584   MatScalar      *aval=a->val;
1585 
1586   PetscFunctionBegin;
1587   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1588   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1593 {
1594   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1595   MatScalar      *aval=a->val;
1596   PetscScalar    oalpha=alpha;
1597   PetscBLASInt   one=1,size;
1598 
1599   PetscFunctionBegin;
1600   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1601   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1602   PetscCall(PetscLogFlops(a->nz));
1603   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1608 {
1609   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1610 
1611   PetscFunctionBegin;
1612   if (!Y->preallocated || !y->nz) {
1613     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1614   }
1615   PetscCall(MatShift_Basic(Y,a));
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1620 {
1621   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1622   PetscScalar       *x,sum,*t;
1623   const MatScalar   *idiag=NULL,*mdiag;
1624   const PetscScalar *b,*xb;
1625   PetscInt          n,m=A->rmap->n,i,j,shift;
1626   const PetscInt    *diag;
1627 
1628   PetscFunctionBegin;
1629   its = its*lits;
1630 
1631   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1632   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1633   a->fshift = fshift;
1634   a->omega  = omega;
1635 
1636   diag  = a->diag;
1637   t     = a->ssor_work;
1638   idiag = a->idiag;
1639   mdiag = a->mdiag;
1640 
1641   PetscCall(VecGetArray(xx,&x));
1642   PetscCall(VecGetArrayRead(bb,&b));
1643   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1644   PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1645   PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1646   PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1647 
1648   if (flag & SOR_ZERO_INITIAL_GUESS) {
1649     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1650       for (i=0; i<m; i++) {
1651         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1652         sum   = b[i];
1653         n     = (diag[i]-shift)/8;
1654         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1655         t[i]  = sum;
1656         x[i]  = sum*idiag[i];
1657       }
1658       xb   = t;
1659       PetscCall(PetscLogFlops(a->nz));
1660     } else xb = b;
1661     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1662       for (i=m-1; i>=0; i--) {
1663         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1664         sum   = xb[i];
1665         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1666         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1667         if (xb == b) {
1668           x[i] = sum*idiag[i];
1669         } else {
1670           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1671         }
1672       }
1673       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1674     }
1675     its--;
1676   }
1677   while (its--) {
1678     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1679       for (i=0; i<m; i++) {
1680         /* lower */
1681         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1682         sum   = b[i];
1683         n     = (diag[i]-shift)/8;
1684         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1685         t[i]  = sum;             /* save application of the lower-triangular part */
1686         /* upper */
1687         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1688         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1689         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1690       }
1691       xb   = t;
1692       PetscCall(PetscLogFlops(2.0*a->nz));
1693     } else xb = b;
1694     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1695       for (i=m-1; i>=0; i--) {
1696         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1697         sum = xb[i];
1698         if (xb == b) {
1699           /* whole matrix (no checkpointing available) */
1700           n     = a->rlen[i];
1701           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1702           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1703         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1704           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1705           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1706           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1707         }
1708       }
1709       if (xb == b) {
1710         PetscCall(PetscLogFlops(2.0*a->nz));
1711       } else {
1712         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1713       }
1714     }
1715   }
1716   PetscCall(VecRestoreArray(xx,&x));
1717   PetscCall(VecRestoreArrayRead(bb,&b));
1718   PetscFunctionReturn(0);
1719 }
1720 
1721 /* -------------------------------------------------------------------*/
1722 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1723                                        MatGetRow_SeqSELL,
1724                                        MatRestoreRow_SeqSELL,
1725                                        MatMult_SeqSELL,
1726                                /* 4*/  MatMultAdd_SeqSELL,
1727                                        MatMultTranspose_SeqSELL,
1728                                        MatMultTransposeAdd_SeqSELL,
1729                                        NULL,
1730                                        NULL,
1731                                        NULL,
1732                                /* 10*/ NULL,
1733                                        NULL,
1734                                        NULL,
1735                                        MatSOR_SeqSELL,
1736                                        NULL,
1737                                /* 15*/ MatGetInfo_SeqSELL,
1738                                        MatEqual_SeqSELL,
1739                                        MatGetDiagonal_SeqSELL,
1740                                        MatDiagonalScale_SeqSELL,
1741                                        NULL,
1742                                /* 20*/ NULL,
1743                                        MatAssemblyEnd_SeqSELL,
1744                                        MatSetOption_SeqSELL,
1745                                        MatZeroEntries_SeqSELL,
1746                                /* 24*/ NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        NULL,
1750                                        NULL,
1751                                /* 29*/ MatSetUp_SeqSELL,
1752                                        NULL,
1753                                        NULL,
1754                                        NULL,
1755                                        NULL,
1756                                /* 34*/ MatDuplicate_SeqSELL,
1757                                        NULL,
1758                                        NULL,
1759                                        NULL,
1760                                        NULL,
1761                                /* 39*/ NULL,
1762                                        NULL,
1763                                        NULL,
1764                                        MatGetValues_SeqSELL,
1765                                        MatCopy_SeqSELL,
1766                                /* 44*/ NULL,
1767                                        MatScale_SeqSELL,
1768                                        MatShift_SeqSELL,
1769                                        NULL,
1770                                        NULL,
1771                                /* 49*/ NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        NULL,
1775                                        NULL,
1776                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1777                                        NULL,
1778                                        NULL,
1779                                        NULL,
1780                                        NULL,
1781                                /* 59*/ NULL,
1782                                        MatDestroy_SeqSELL,
1783                                        MatView_SeqSELL,
1784                                        NULL,
1785                                        NULL,
1786                                /* 64*/ NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        NULL,
1790                                        NULL,
1791                                /* 69*/ NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        NULL,
1795                                        NULL,
1796                                /* 74*/ NULL,
1797                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1798                                        NULL,
1799                                        NULL,
1800                                        NULL,
1801                                /* 79*/ NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        NULL,
1805                                        NULL,
1806                                /* 84*/ NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        NULL,
1810                                        NULL,
1811                                /* 89*/ NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        NULL,
1815                                        NULL,
1816                                /* 94*/ NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        NULL,
1821                                /* 99*/ NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        MatConjugate_SeqSELL,
1825                                        NULL,
1826                                /*104*/ NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        NULL,
1831                                /*109*/ NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        MatMissingDiagonal_SeqSELL,
1836                                /*114*/ NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        NULL,
1841                                /*119*/ NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        NULL,
1846                                /*124*/ NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        NULL,
1851                                /*129*/ NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        NULL,
1855                                        NULL,
1856                                /*134*/ NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        NULL,
1861                                /*139*/ NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        MatFDColoringSetUp_SeqXAIJ,
1865                                        NULL,
1866                                /*144*/ NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        NULL
1870 };
1871 
1872 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1873 {
1874   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1875 
1876   PetscFunctionBegin;
1877   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1878 
1879   /* allocate space for values if not already there */
1880   if (!a->saved_values) {
1881     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1882     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1883   }
1884 
1885   /* copy values over */
1886   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1891 {
1892   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1893 
1894   PetscFunctionBegin;
1895   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1896   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1897   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 /*@C
1902  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1903 
1904  Not Collective
1905 
1906  Input Parameters:
1907  .  mat - a MATSEQSELL matrix
1908  .  array - pointer to the data
1909 
1910  Level: intermediate
1911 
1912  .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1913  @*/
1914 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1915 {
1916   PetscFunctionBegin;
1917   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1922 {
1923   Mat_SeqSELL    *b;
1924   PetscMPIInt    size;
1925 
1926   PetscFunctionBegin;
1927   PetscCall(PetscCitationsRegister(citation,&cited));
1928   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1929   PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1930 
1931   PetscCall(PetscNewLog(B,&b));
1932 
1933   B->data = (void*)b;
1934 
1935   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1936 
1937   b->row                = NULL;
1938   b->col                = NULL;
1939   b->icol               = NULL;
1940   b->reallocs           = 0;
1941   b->ignorezeroentries  = PETSC_FALSE;
1942   b->roworiented        = PETSC_TRUE;
1943   b->nonew              = 0;
1944   b->diag               = NULL;
1945   b->solve_work         = NULL;
1946   B->spptr              = NULL;
1947   b->saved_values       = NULL;
1948   b->idiag              = NULL;
1949   b->mdiag              = NULL;
1950   b->ssor_work          = NULL;
1951   b->omega              = 1.0;
1952   b->fshift             = 0.0;
1953   b->idiagvalid         = PETSC_FALSE;
1954   b->keepnonzeropattern = PETSC_FALSE;
1955 
1956   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1957   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1958   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1959   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1960   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1961   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1962   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1963   PetscFunctionReturn(0);
1964 }
1965 
1966 /*
1967  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1968  */
1969 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1970 {
1971   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1972   PetscInt       i,m=A->rmap->n;
1973   PetscInt       totalslices=a->totalslices;
1974 
1975   PetscFunctionBegin;
1976   C->factortype = A->factortype;
1977   c->row        = NULL;
1978   c->col        = NULL;
1979   c->icol       = NULL;
1980   c->reallocs   = 0;
1981   C->assembled = PETSC_TRUE;
1982 
1983   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1984   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1985 
1986   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1987   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1988   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1989   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1990 
1991   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1992   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1993 
1994   /* allocate the matrix space */
1995   if (mallocmatspace) {
1996     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
1997     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
1998 
1999     c->singlemalloc = PETSC_TRUE;
2000 
2001     if (m > 0) {
2002       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2003       if (cpvalues == MAT_COPY_VALUES) {
2004         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2005       } else {
2006         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2007       }
2008     }
2009   }
2010 
2011   c->ignorezeroentries = a->ignorezeroentries;
2012   c->roworiented       = a->roworiented;
2013   c->nonew             = a->nonew;
2014   if (a->diag) {
2015     PetscCall(PetscMalloc1(m,&c->diag));
2016     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2017     for (i=0; i<m; i++) {
2018       c->diag[i] = a->diag[i];
2019     }
2020   } else c->diag = NULL;
2021 
2022   c->solve_work         = NULL;
2023   c->saved_values       = NULL;
2024   c->idiag              = NULL;
2025   c->ssor_work          = NULL;
2026   c->keepnonzeropattern = a->keepnonzeropattern;
2027   c->free_val           = PETSC_TRUE;
2028   c->free_colidx        = PETSC_TRUE;
2029 
2030   c->maxallocmat  = a->maxallocmat;
2031   c->maxallocrow  = a->maxallocrow;
2032   c->rlenmax      = a->rlenmax;
2033   c->nz           = a->nz;
2034   C->preallocated = PETSC_TRUE;
2035 
2036   c->nonzerorowcnt = a->nonzerorowcnt;
2037   C->nonzerostate  = A->nonzerostate;
2038 
2039   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2040   PetscFunctionReturn(0);
2041 }
2042 
2043 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2044 {
2045   PetscFunctionBegin;
2046   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2047   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2048   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2049     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2050   }
2051   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2052   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2053   PetscFunctionReturn(0);
2054 }
2055 
2056 /*MC
2057    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2058    based on the sliced Ellpack format
2059 
2060    Options Database Keys:
2061 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2062 
2063    Level: beginner
2064 
2065 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2066 M*/
2067 
2068 /*MC
2069    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2070 
2071    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2072    and MATMPISELL otherwise.  As a result, for single process communicators,
2073   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2074   for communicators controlling multiple processes.  It is recommended that you call both of
2075   the above preallocation routines for simplicity.
2076 
2077    Options Database Keys:
2078 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2079 
2080   Level: beginner
2081 
2082   Notes:
2083    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2084 
2085    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2086    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2087 
2088   Developer Notes:
2089    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2090 
2091    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2092 .vb
2093                             (2 0  3 4)
2094    Consider the matrix A =  (5 0  6 0)
2095                             (0 0  7 8)
2096                             (0 0  9 9)
2097 
2098    symbolically the Ellpack format can be written as
2099 
2100         (2 3 4 |)           (0 2 3 |)
2101    v =  (5 6 0 |)  colidx = (0 2 2 |)
2102         --------            ---------
2103         (7 8 |)             (2 3 |)
2104         (9 9 |)             (2 3 |)
2105 
2106     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2107     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2108     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2109 
2110     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2111 
2112 .ve
2113 
2114       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2115 
2116  References:
2117 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2118    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2119 
2120 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2121 M*/
2122 
2123 /*@C
2124        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2125 
2126  Collective on comm
2127 
2128  Input Parameters:
2129 +  comm - MPI communicator, set to PETSC_COMM_SELF
2130 .  m - number of rows
2131 .  n - number of columns
2132 .  rlenmax - maximum number of nonzeros in a row
2133 -  rlen - array containing the number of nonzeros in the various rows
2134  (possibly different for each row) or NULL
2135 
2136  Output Parameter:
2137 .  A - the matrix
2138 
2139  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2140  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2141  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2142 
2143  Notes:
2144  If nnz is given then nz is ignored
2145 
2146  Specify the preallocated storage with either rlenmax or rlen (not both).
2147  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2148  allocation.  For large problems you MUST preallocate memory or you
2149  will get TERRIBLE performance, see the users' manual chapter on matrices.
2150 
2151  Level: intermediate
2152 
2153  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2154 
2155  @*/
2156 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2157 {
2158   PetscFunctionBegin;
2159   PetscCall(MatCreate(comm,A));
2160   PetscCall(MatSetSizes(*A,m,n,m,n));
2161   PetscCall(MatSetType(*A,MATSEQSELL));
2162   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2163   PetscFunctionReturn(0);
2164 }
2165 
2166 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2167 {
2168   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2169   PetscInt       totalslices=a->totalslices;
2170 
2171   PetscFunctionBegin;
2172   /* If the  matrix dimensions are not equal,or no of nonzeros */
2173   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2174     *flg = PETSC_FALSE;
2175     PetscFunctionReturn(0);
2176   }
2177   /* if the a->colidx are the same */
2178   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2179   if (!*flg) PetscFunctionReturn(0);
2180   /* if a->val are the same */
2181   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2182   PetscFunctionReturn(0);
2183 }
2184 
2185 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2186 {
2187   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2188 
2189   PetscFunctionBegin;
2190   a->idiagvalid  = PETSC_FALSE;
2191   PetscFunctionReturn(0);
2192 }
2193 
2194 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2195 {
2196 #if defined(PETSC_USE_COMPLEX)
2197   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2198   PetscInt    i;
2199   PetscScalar *val = a->val;
2200 
2201   PetscFunctionBegin;
2202   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2203     val[i] = PetscConj(val[i]);
2204   }
2205 #else
2206   PetscFunctionBegin;
2207 #endif
2208   PetscFunctionReturn(0);
2209 }
2210