xref: /petsc/src/mat/impls/sell/seq/sell.c (revision c3b5f7ba6bc5ce25a01a67bb37ba5d34b02bbbd7)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheckFalse(maxallocrow < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheckFalse(rlen[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheckFalse(rlen[i] > B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) {
179     PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
180   }
181   PetscFunctionReturn(0);
182 }
183 
184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
185 {
186   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
187   PetscInt    shift;
188 
189   PetscFunctionBegin;
190   PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
191   if (nz) *nz = a->rlen[row];
192   shift = a->sliidx[row>>3]+(row&0x07);
193   if (!a->getrowcols) {
194 
195     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
196   }
197   if (idx) {
198     PetscInt j;
199     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
200     *idx = a->getrowcols;
201   }
202   if (v) {
203     PetscInt j;
204     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
205     *v = a->getrowvals;
206   }
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
211 {
212   PetscFunctionBegin;
213   PetscFunctionReturn(0);
214 }
215 
216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
217 {
218   Mat            B;
219   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (reuse == MAT_REUSE_MATRIX) {
224     B    = *newmat;
225     PetscCall(MatZeroEntries(B));
226   } else {
227     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
228     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
229     PetscCall(MatSetType(B,MATSEQAIJ));
230     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
231   }
232 
233   for (i=0; i<A->rmap->n; i++) {
234     PetscInt    nz = 0,*cols = NULL;
235     PetscScalar *vals = NULL;
236 
237     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
238     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
239     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
240   }
241 
242   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
243   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
244   B->rmap->bs = A->rmap->bs;
245 
246   if (reuse == MAT_INPLACE_MATRIX) {
247     PetscCall(MatHeaderReplace(A,&B));
248   } else {
249     *newmat = B;
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 #include <../src/mat/impls/aij/seq/aij.h>
255 
256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
257 {
258   Mat               B;
259   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
260   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
261   const PetscInt    *cols;
262   const PetscScalar *vals;
263 
264   PetscFunctionBegin;
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     if (PetscDefined(USE_DEBUG) || !a->ilen) {
270       PetscCall(PetscMalloc1(m,&rowlengths));
271       for (i=0; i<m; i++) {
272         rowlengths[i] = ai[i+1] - ai[i];
273       }
274     }
275     if (PetscDefined(USE_DEBUG) && a->ilen) {
276       PetscBool eq;
277       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
278       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
279       PetscCall(PetscFree(rowlengths));
280       rowlengths = a->ilen;
281     } else if (a->ilen) rowlengths = a->ilen;
282     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
283     PetscCall(MatSetSizes(B,m,n,m,n));
284     PetscCall(MatSetType(B,MATSEQSELL));
285     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
286     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
287   }
288 
289   for (row=0; row<m; row++) {
290     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
292     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
293   }
294   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
295   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
296   B->rmap->bs = A->rmap->bs;
297 
298   if (reuse == MAT_INPLACE_MATRIX) {
299     PetscCall(MatHeaderReplace(A,&B));
300   } else {
301     *newmat = B;
302   }
303   PetscFunctionReturn(0);
304 }
305 
306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
309   PetscScalar       *y;
310   const PetscScalar *x;
311   const MatScalar   *aval=a->val;
312   PetscInt          totalslices=a->totalslices;
313   const PetscInt    *acolidx=a->colidx;
314   PetscInt          i,j;
315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m512d           vec_x,vec_y,vec_vals;
317   __m256i           vec_idx;
318   __mmask8          mask;
319   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
320   __m256i           vec_idx2,vec_idx3,vec_idx4;
321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322   __m128i           vec_idx;
323   __m256d           vec_x,vec_y,vec_y2,vec_vals;
324   MatScalar         yval;
325   PetscInt          r,rows_left,row,nnz_in_row;
326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
327   __m128d           vec_x_tmp;
328   __m256d           vec_x,vec_y,vec_y2,vec_vals;
329   MatScalar         yval;
330   PetscInt          r,rows_left,row,nnz_in_row;
331 #else
332   PetscScalar       sum[8];
333 #endif
334 
335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
336 #pragma disjoint(*x,*y,*aval)
337 #endif
338 
339   PetscFunctionBegin;
340   PetscCall(VecGetArrayRead(xx,&x));
341   PetscCall(VecGetArray(yy,&y));
342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
343   for (i=0; i<totalslices; i++) { /* loop over slices */
344     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
345     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
346 
347     vec_y  = _mm512_setzero_pd();
348     vec_y2 = _mm512_setzero_pd();
349     vec_y3 = _mm512_setzero_pd();
350     vec_y4 = _mm512_setzero_pd();
351 
352     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
353     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
354     case 3:
355       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
360       acolidx += 8; aval += 8;
361       j += 3;
362       break;
363     case 2:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
367       acolidx += 8; aval += 8;
368       j += 2;
369       break;
370     case 1:
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       j += 1;
374       break;
375     }
376     #pragma novector
377     for (; j<(a->sliidx[i+1]>>3); j+=4) {
378       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
383       acolidx += 8; aval += 8;
384       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
385       acolidx += 8; aval += 8;
386     }
387 
388     vec_y = _mm512_add_pd(vec_y,vec_y2);
389     vec_y = _mm512_add_pd(vec_y,vec_y3);
390     vec_y = _mm512_add_pd(vec_y,vec_y4);
391     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
392       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
393       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
394     } else {
395       _mm512_storeu_pd(&y[8*i],vec_y);
396     }
397   }
398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
399   for (i=0; i<totalslices; i++) { /* loop over full slices */
400     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
401     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
402 
403     /* last slice may have padding rows. Don't use vectorization. */
404     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
405       rows_left = A->rmap->n - 8*i;
406       for (r=0; r<rows_left; ++r) {
407         yval = (MatScalar)0;
408         row = 8*i + r;
409         nnz_in_row = a->rlen[row];
410         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
411         y[row] = yval;
412       }
413       break;
414     }
415 
416     vec_y  = _mm256_setzero_pd();
417     vec_y2 = _mm256_setzero_pd();
418 
419     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
420     #pragma novector
421     #pragma unroll(2)
422     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
424       aval += 4; acolidx += 4;
425       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
426       aval += 4; acolidx += 4;
427     }
428 
429     _mm256_storeu_pd(y+i*8,vec_y);
430     _mm256_storeu_pd(y+i*8+4,vec_y2);
431   }
432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433   for (i=0; i<totalslices; i++) { /* loop over full slices */
434     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
435     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
436 
437     vec_y  = _mm256_setzero_pd();
438     vec_y2 = _mm256_setzero_pd();
439 
440     /* last slice may have padding rows. Don't use vectorization. */
441     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
442       rows_left = A->rmap->n - 8*i;
443       for (r=0; r<rows_left; ++r) {
444         yval = (MatScalar)0;
445         row = 8*i + r;
446         nnz_in_row = a->rlen[row];
447         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
448         y[row] = yval;
449       }
450       break;
451     }
452 
453     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
454     #pragma novector
455     #pragma unroll(2)
456     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
465       aval     += 4;
466 
467       vec_vals  = _mm256_loadu_pd(aval);
468       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
471       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
472       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
473       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
474       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
475       aval     += 4;
476     }
477 
478     _mm256_storeu_pd(y + i*8,     vec_y);
479     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
480   }
481 #else
482   for (i=0; i<totalslices; i++) { /* loop over slices */
483     for (j=0; j<8; j++) sum[j] = 0.0;
484     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
485       sum[0] += aval[j] * x[acolidx[j]];
486       sum[1] += aval[j+1] * x[acolidx[j+1]];
487       sum[2] += aval[j+2] * x[acolidx[j+2]];
488       sum[3] += aval[j+3] * x[acolidx[j+3]];
489       sum[4] += aval[j+4] * x[acolidx[j+4]];
490       sum[5] += aval[j+5] * x[acolidx[j+5]];
491       sum[6] += aval[j+6] * x[acolidx[j+6]];
492       sum[7] += aval[j+7] * x[acolidx[j+7]];
493     }
494     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
495       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
496     } else {
497       for (j=0; j<8; j++) y[8*i+j] = sum[j];
498     }
499   }
500 #endif
501 
502   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
503   PetscCall(VecRestoreArrayRead(xx,&x));
504   PetscCall(VecRestoreArray(yy,&y));
505   PetscFunctionReturn(0);
506 }
507 
508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
509 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
510 {
511   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
512   PetscScalar       *y,*z;
513   const PetscScalar *x;
514   const MatScalar   *aval=a->val;
515   PetscInt          totalslices=a->totalslices;
516   const PetscInt    *acolidx=a->colidx;
517   PetscInt          i,j;
518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519   __m512d           vec_x,vec_y,vec_vals;
520   __m256i           vec_idx;
521   __mmask8          mask;
522   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
523   __m256i           vec_idx2,vec_idx3,vec_idx4;
524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
525   __m128d           vec_x_tmp;
526   __m256d           vec_x,vec_y,vec_y2,vec_vals;
527   MatScalar         yval;
528   PetscInt          r,row,nnz_in_row;
529 #else
530   PetscScalar       sum[8];
531 #endif
532 
533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
534 #pragma disjoint(*x,*y,*aval)
535 #endif
536 
537   PetscFunctionBegin;
538   PetscCall(VecGetArrayRead(xx,&x));
539   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
541   for (i=0; i<totalslices; i++) { /* loop over slices */
542     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
543     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
544 
545     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
546       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
547       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
548     } else {
549       vec_y  = _mm512_loadu_pd(&y[8*i]);
550     }
551     vec_y2 = _mm512_setzero_pd();
552     vec_y3 = _mm512_setzero_pd();
553     vec_y4 = _mm512_setzero_pd();
554 
555     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
556     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
557     case 3:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
563       acolidx += 8; aval += 8;
564       j += 3;
565       break;
566     case 2:
567       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
568       acolidx += 8; aval += 8;
569       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
570       acolidx += 8; aval += 8;
571       j += 2;
572       break;
573     case 1:
574       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
575       acolidx += 8; aval += 8;
576       j += 1;
577       break;
578     }
579     #pragma novector
580     for (; j<(a->sliidx[i+1]>>3); j+=4) {
581       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
582       acolidx += 8; aval += 8;
583       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
584       acolidx += 8; aval += 8;
585       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
586       acolidx += 8; aval += 8;
587       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
588       acolidx += 8; aval += 8;
589     }
590 
591     vec_y = _mm512_add_pd(vec_y,vec_y2);
592     vec_y = _mm512_add_pd(vec_y,vec_y3);
593     vec_y = _mm512_add_pd(vec_y,vec_y4);
594     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
595       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
596     } else {
597       _mm512_storeu_pd(&z[8*i],vec_y);
598     }
599   }
600 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
601   for (i=0; i<totalslices; i++) { /* loop over full slices */
602     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
603     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
604 
605     /* last slice may have padding rows. Don't use vectorization. */
606     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
607       for (r=0; r<(A->rmap->n & 0x07); ++r) {
608         row        = 8*i + r;
609         yval       = (MatScalar)0.0;
610         nnz_in_row = a->rlen[row];
611         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
612         z[row] = y[row] + yval;
613       }
614       break;
615     }
616 
617     vec_y  = _mm256_loadu_pd(y+8*i);
618     vec_y2 = _mm256_loadu_pd(y+8*i+4);
619 
620     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
621     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
622       vec_vals  = _mm256_loadu_pd(aval);
623       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
624       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
625       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
626       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
627       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
628       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
629       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
630       aval     += 4;
631 
632       vec_vals  = _mm256_loadu_pd(aval);
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
636       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
639       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
640       aval     += 4;
641     }
642 
643     _mm256_storeu_pd(z+i*8,vec_y);
644     _mm256_storeu_pd(z+i*8+4,vec_y2);
645   }
646 #else
647   for (i=0; i<totalslices; i++) { /* loop over slices */
648     for (j=0; j<8; j++) sum[j] = 0.0;
649     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
650       sum[0] += aval[j] * x[acolidx[j]];
651       sum[1] += aval[j+1] * x[acolidx[j+1]];
652       sum[2] += aval[j+2] * x[acolidx[j+2]];
653       sum[3] += aval[j+3] * x[acolidx[j+3]];
654       sum[4] += aval[j+4] * x[acolidx[j+4]];
655       sum[5] += aval[j+5] * x[acolidx[j+5]];
656       sum[6] += aval[j+6] * x[acolidx[j+6]];
657       sum[7] += aval[j+7] * x[acolidx[j+7]];
658     }
659     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
660       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
661     } else {
662       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
663     }
664   }
665 #endif
666 
667   PetscCall(PetscLogFlops(2.0*a->nz));
668   PetscCall(VecRestoreArrayRead(xx,&x));
669   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
670   PetscFunctionReturn(0);
671 }
672 
673 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
674 {
675   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
676   PetscScalar       *y;
677   const PetscScalar *x;
678   const MatScalar   *aval=a->val;
679   const PetscInt    *acolidx=a->colidx;
680   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
681 
682 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
683 #pragma disjoint(*x,*y,*aval)
684 #endif
685 
686   PetscFunctionBegin;
687   if (A->symmetric) {
688     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
689     PetscFunctionReturn(0);
690   }
691   if (zz != yy) PetscCall(VecCopy(zz,yy));
692   PetscCall(VecGetArrayRead(xx,&x));
693   PetscCall(VecGetArray(yy,&y));
694   for (i=0; i<a->totalslices; i++) { /* loop over slices */
695     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
696       for (r=0; r<(A->rmap->n & 0x07); ++r) {
697         row        = 8*i + r;
698         nnz_in_row = a->rlen[row];
699         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
700       }
701       break;
702     }
703     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
704       y[acolidx[j]]   += aval[j] * x[8*i];
705       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
706       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
707       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
708       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
709       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
710       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
711       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
712     }
713   }
714   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
715   PetscCall(VecRestoreArrayRead(xx,&x));
716   PetscCall(VecRestoreArray(yy,&y));
717   PetscFunctionReturn(0);
718 }
719 
720 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
721 {
722   PetscFunctionBegin;
723   if (A->symmetric) {
724     PetscCall(MatMult_SeqSELL(A,xx,yy));
725   } else {
726     PetscCall(VecSet(yy,0.0));
727     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
728   }
729   PetscFunctionReturn(0);
730 }
731 
732 /*
733      Checks for missing diagonals
734 */
735 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
736 {
737   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
738   PetscInt       *diag,i;
739 
740   PetscFunctionBegin;
741   *missing = PETSC_FALSE;
742   if (A->rmap->n > 0 && !(a->colidx)) {
743     *missing = PETSC_TRUE;
744     if (d) *d = 0;
745     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
746   } else {
747     diag = a->diag;
748     for (i=0; i<A->rmap->n; i++) {
749       if (diag[i] == -1) {
750         *missing = PETSC_TRUE;
751         if (d) *d = i;
752         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
753         break;
754       }
755     }
756   }
757   PetscFunctionReturn(0);
758 }
759 
760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
761 {
762   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
763   PetscInt       i,j,m=A->rmap->n,shift;
764 
765   PetscFunctionBegin;
766   if (!a->diag) {
767     PetscCall(PetscMalloc1(m,&a->diag));
768     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
769     a->free_diag = PETSC_TRUE;
770   }
771   for (i=0; i<m; i++) { /* loop over rows */
772     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
773     a->diag[i] = -1;
774     for (j=0; j<a->rlen[i]; j++) {
775       if (a->colidx[shift+j*8] == i) {
776         a->diag[i] = shift+j*8;
777         break;
778       }
779     }
780   }
781   PetscFunctionReturn(0);
782 }
783 
784 /*
785   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
786 */
787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
788 {
789   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
790   PetscInt       i,*diag,m = A->rmap->n;
791   MatScalar      *val = a->val;
792   PetscScalar    *idiag,*mdiag;
793 
794   PetscFunctionBegin;
795   if (a->idiagvalid) PetscFunctionReturn(0);
796   PetscCall(MatMarkDiagonal_SeqSELL(A));
797   diag = a->diag;
798   if (!a->idiag) {
799     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
800     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
801     val  = a->val;
802   }
803   mdiag = a->mdiag;
804   idiag = a->idiag;
805 
806   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
807     for (i=0; i<m; i++) {
808       mdiag[i] = val[diag[i]];
809       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
810         if (PetscRealPart(fshift)) {
811           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
812           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
813           A->factorerror_zeropivot_value = 0.0;
814           A->factorerror_zeropivot_row   = i;
815         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
816       }
817       idiag[i] = 1.0/val[diag[i]];
818     }
819     PetscCall(PetscLogFlops(m));
820   } else {
821     for (i=0; i<m; i++) {
822       mdiag[i] = val[diag[i]];
823       idiag[i] = omega/(fshift + val[diag[i]]);
824     }
825     PetscCall(PetscLogFlops(2.0*m));
826   }
827   a->idiagvalid = PETSC_TRUE;
828   PetscFunctionReturn(0);
829 }
830 
831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
832 {
833   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
834 
835   PetscFunctionBegin;
836   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
837   PetscCall(MatSeqSELLInvalidateDiagonal(A));
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatDestroy_SeqSELL(Mat A)
842 {
843   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
844 
845   PetscFunctionBegin;
846 #if defined(PETSC_USE_LOG)
847   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
848 #endif
849   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
850   PetscCall(ISDestroy(&a->row));
851   PetscCall(ISDestroy(&a->col));
852   PetscCall(PetscFree(a->diag));
853   PetscCall(PetscFree(a->rlen));
854   PetscCall(PetscFree(a->sliidx));
855   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
856   PetscCall(PetscFree(a->solve_work));
857   PetscCall(ISDestroy(&a->icol));
858   PetscCall(PetscFree(a->saved_values));
859   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
860 
861   PetscCall(PetscFree(A->data));
862 
863   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
864   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
867   PetscFunctionReturn(0);
868 }
869 
870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
871 {
872   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
873 
874   PetscFunctionBegin;
875   switch (op) {
876   case MAT_ROW_ORIENTED:
877     a->roworiented = flg;
878     break;
879   case MAT_KEEP_NONZERO_PATTERN:
880     a->keepnonzeropattern = flg;
881     break;
882   case MAT_NEW_NONZERO_LOCATIONS:
883     a->nonew = (flg ? 0 : 1);
884     break;
885   case MAT_NEW_NONZERO_LOCATION_ERR:
886     a->nonew = (flg ? -1 : 0);
887     break;
888   case MAT_NEW_NONZERO_ALLOCATION_ERR:
889     a->nonew = (flg ? -2 : 0);
890     break;
891   case MAT_UNUSED_NONZERO_LOCATION_ERR:
892     a->nounused = (flg ? -1 : 0);
893     break;
894   case MAT_FORCE_DIAGONAL_ENTRIES:
895   case MAT_IGNORE_OFF_PROC_ENTRIES:
896   case MAT_USE_HASH_TABLE:
897   case MAT_SORTED_FULL:
898     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
899     break;
900   case MAT_SPD:
901   case MAT_SYMMETRIC:
902   case MAT_STRUCTURALLY_SYMMETRIC:
903   case MAT_HERMITIAN:
904   case MAT_SYMMETRY_ETERNAL:
905     /* These options are handled directly by MatSetOption() */
906     break;
907   default:
908     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
909   }
910   PetscFunctionReturn(0);
911 }
912 
913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
914 {
915   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
916   PetscInt       i,j,n,shift;
917   PetscScalar    *x,zero=0.0;
918 
919   PetscFunctionBegin;
920   PetscCall(VecGetLocalSize(v,&n));
921   PetscCheckFalse(n != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
922 
923   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
924     PetscInt *diag=a->diag;
925     PetscCall(VecGetArray(v,&x));
926     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
927     PetscCall(VecRestoreArray(v,&x));
928     PetscFunctionReturn(0);
929   }
930 
931   PetscCall(VecSet(v,zero));
932   PetscCall(VecGetArray(v,&x));
933   for (i=0; i<n; i++) { /* loop over rows */
934     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
935     x[i] = 0;
936     for (j=0; j<a->rlen[i]; j++) {
937       if (a->colidx[shift+j*8] == i) {
938         x[i] = a->val[shift+j*8];
939         break;
940       }
941     }
942   }
943   PetscCall(VecRestoreArray(v,&x));
944   PetscFunctionReturn(0);
945 }
946 
947 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
948 {
949   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
950   const PetscScalar *l,*r;
951   PetscInt          i,j,m,n,row;
952 
953   PetscFunctionBegin;
954   if (ll) {
955     /* The local size is used so that VecMPI can be passed to this routine
956        by MatDiagonalScale_MPISELL */
957     PetscCall(VecGetLocalSize(ll,&m));
958     PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
959     PetscCall(VecGetArrayRead(ll,&l));
960     for (i=0; i<a->totalslices; i++) { /* loop over slices */
961       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
962         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
963           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
964         }
965       } else {
966         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967           a->val[j] *= l[8*i+row];
968         }
969       }
970     }
971     PetscCall(VecRestoreArrayRead(ll,&l));
972     PetscCall(PetscLogFlops(a->nz));
973   }
974   if (rr) {
975     PetscCall(VecGetLocalSize(rr,&n));
976     PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
977     PetscCall(VecGetArrayRead(rr,&r));
978     for (i=0; i<a->totalslices; i++) { /* loop over slices */
979       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
980         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
981           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
982         }
983       } else {
984         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
985           a->val[j] *= r[a->colidx[j]];
986         }
987       }
988     }
989     PetscCall(VecRestoreArrayRead(rr,&r));
990     PetscCall(PetscLogFlops(a->nz));
991   }
992   PetscCall(MatSeqSELLInvalidateDiagonal(A));
993   PetscFunctionReturn(0);
994 }
995 
996 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
997 {
998   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
999   PetscInt    *cp,i,k,low,high,t,row,col,l;
1000   PetscInt    shift;
1001   MatScalar   *vp;
1002 
1003   PetscFunctionBegin;
1004   for (k=0; k<m; k++) { /* loop over requested rows */
1005     row = im[k];
1006     if (row<0) continue;
1007     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1008     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1009     cp = a->colidx+shift; /* pointer to the row */
1010     vp = a->val+shift; /* pointer to the row */
1011     for (l=0; l<n; l++) { /* loop over requested columns */
1012       col = in[l];
1013       if (col<0) continue;
1014       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1015       high = a->rlen[row]; low = 0; /* assume unsorted */
1016       while (high-low > 5) {
1017         t = (low+high)/2;
1018         if (*(cp+t*8) > col) high = t;
1019         else low = t;
1020       }
1021       for (i=low; i<high; i++) {
1022         if (*(cp+8*i) > col) break;
1023         if (*(cp+8*i) == col) {
1024           *v++ = *(vp+8*i);
1025           goto finished;
1026         }
1027       }
1028       *v++ = 0.0;
1029     finished:;
1030     }
1031   }
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1036 {
1037   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1038   PetscInt          i,j,m=A->rmap->n,shift;
1039   const char        *name;
1040   PetscViewerFormat format;
1041 
1042   PetscFunctionBegin;
1043   PetscCall(PetscViewerGetFormat(viewer,&format));
1044   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1045     PetscInt nofinalvalue = 0;
1046     /*
1047     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1048       nofinalvalue = 1;
1049     }
1050     */
1051     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1052     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1053     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1054 #if defined(PETSC_USE_COMPLEX)
1055     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1056 #else
1057     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1058 #endif
1059     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1060 
1061     for (i=0; i<m; i++) {
1062       shift = a->sliidx[i>>3]+(i&0x07);
1063       for (j=0; j<a->rlen[i]; j++) {
1064 #if defined(PETSC_USE_COMPLEX)
1065         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1066 #else
1067         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1068 #endif
1069       }
1070     }
1071     /*
1072     if (nofinalvalue) {
1073 #if defined(PETSC_USE_COMPLEX)
1074       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1075 #else
1076       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1077 #endif
1078     }
1079     */
1080     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1081     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1082     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1083   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1084     PetscFunctionReturn(0);
1085   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1086     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1087     for (i=0; i<m; i++) {
1088       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1089       shift = a->sliidx[i>>3]+(i&0x07);
1090       for (j=0; j<a->rlen[i]; j++) {
1091 #if defined(PETSC_USE_COMPLEX)
1092         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1093           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1094         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1095           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1096         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1097           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1098         }
1099 #else
1100         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1101 #endif
1102       }
1103       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1104     }
1105     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1106   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1107     PetscInt    cnt=0,jcnt;
1108     PetscScalar value;
1109 #if defined(PETSC_USE_COMPLEX)
1110     PetscBool   realonly=PETSC_TRUE;
1111     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1112       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1113         realonly = PETSC_FALSE;
1114         break;
1115       }
1116     }
1117 #endif
1118 
1119     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1120     for (i=0; i<m; i++) {
1121       jcnt = 0;
1122       shift = a->sliidx[i>>3]+(i&0x07);
1123       for (j=0; j<A->cmap->n; j++) {
1124         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1125           value = a->val[cnt++];
1126           jcnt++;
1127         } else {
1128           value = 0.0;
1129         }
1130 #if defined(PETSC_USE_COMPLEX)
1131         if (realonly) {
1132           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1133         } else {
1134           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1135         }
1136 #else
1137         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1138 #endif
1139       }
1140       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1141     }
1142     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1143   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1144     PetscInt fshift=1;
1145     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1146 #if defined(PETSC_USE_COMPLEX)
1147     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1148 #else
1149     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1150 #endif
1151     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1152     for (i=0; i<m; i++) {
1153       shift = a->sliidx[i>>3]+(i&0x07);
1154       for (j=0; j<a->rlen[i]; j++) {
1155 #if defined(PETSC_USE_COMPLEX)
1156         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1157 #else
1158         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1159 #endif
1160       }
1161     }
1162     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1163   } else if (format == PETSC_VIEWER_NATIVE) {
1164     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1165       PetscInt row;
1166       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1167       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1168 #if defined(PETSC_USE_COMPLEX)
1169         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1170           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1171         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1172           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1173         } else {
1174           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1175         }
1176 #else
1177         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1178 #endif
1179       }
1180     }
1181   } else {
1182     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1183     if (A->factortype) {
1184       for (i=0; i<m; i++) {
1185         shift = a->sliidx[i>>3]+(i&0x07);
1186         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1187         /* L part */
1188         for (j=shift; j<a->diag[i]; j+=8) {
1189 #if defined(PETSC_USE_COMPLEX)
1190           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1191             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1192           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1193             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1194           } else {
1195             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1196           }
1197 #else
1198           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1199 #endif
1200         }
1201         /* diagonal */
1202         j = a->diag[i];
1203 #if defined(PETSC_USE_COMPLEX)
1204         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1205           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1206         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1207           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1208         } else {
1209           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1210         }
1211 #else
1212         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1213 #endif
1214 
1215         /* U part */
1216         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1217 #if defined(PETSC_USE_COMPLEX)
1218           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1219             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1220           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1221             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1222           } else {
1223             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1224           }
1225 #else
1226           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1227 #endif
1228         }
1229         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1230       }
1231     } else {
1232       for (i=0; i<m; i++) {
1233         shift = a->sliidx[i>>3]+(i&0x07);
1234         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1235         for (j=0; j<a->rlen[i]; j++) {
1236 #if defined(PETSC_USE_COMPLEX)
1237           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1238             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1239           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1240             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1241           } else {
1242             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1243           }
1244 #else
1245           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1246 #endif
1247         }
1248         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1249       }
1250     }
1251     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1252   }
1253   PetscCall(PetscViewerFlush(viewer));
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #include <petscdraw.h>
1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1259 {
1260   Mat               A=(Mat)Aa;
1261   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1262   PetscInt          i,j,m=A->rmap->n,shift;
1263   int               color;
1264   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1265   PetscViewer       viewer;
1266   PetscViewerFormat format;
1267   PetscErrorCode    ierr;
1268 
1269   PetscFunctionBegin;
1270   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1271   PetscCall(PetscViewerGetFormat(viewer,&format));
1272   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1273 
1274   /* loop over matrix elements drawing boxes */
1275 
1276   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1277     ierr = PetscDrawCollectiveBegin(draw);PetscCall(ierr);
1278     /* Blue for negative, Cyan for zero and  Red for positive */
1279     color = PETSC_DRAW_BLUE;
1280     for (i=0; i<m; i++) {
1281       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1282       y_l = m - i - 1.0; y_r = y_l + 1.0;
1283       for (j=0; j<a->rlen[i]; j++) {
1284         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1285         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1286         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1287       }
1288     }
1289     color = PETSC_DRAW_CYAN;
1290     for (i=0; i<m; i++) {
1291       shift = a->sliidx[i>>3]+(i&0x07);
1292       y_l = m - i - 1.0; y_r = y_l + 1.0;
1293       for (j=0; j<a->rlen[i]; j++) {
1294         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1295         if (a->val[shift+8*j] !=  0.) continue;
1296         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1297       }
1298     }
1299     color = PETSC_DRAW_RED;
1300     for (i=0; i<m; i++) {
1301       shift = a->sliidx[i>>3]+(i&0x07);
1302       y_l = m - i - 1.0; y_r = y_l + 1.0;
1303       for (j=0; j<a->rlen[i]; j++) {
1304         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1305         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1306         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1307       }
1308     }
1309     ierr = PetscDrawCollectiveEnd(draw);PetscCall(ierr);
1310   } else {
1311     /* use contour shading to indicate magnitude of values */
1312     /* first determine max of all nonzero values */
1313     PetscReal minv=0.0,maxv=0.0;
1314     PetscInt  count=0;
1315     PetscDraw popup;
1316     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1317       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1318     }
1319     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1320     PetscCall(PetscDrawGetPopup(draw,&popup));
1321     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1322 
1323     ierr = PetscDrawCollectiveBegin(draw);PetscCall(ierr);
1324     for (i=0; i<m; i++) {
1325       shift = a->sliidx[i>>3]+(i&0x07);
1326       y_l = m - i - 1.0;
1327       y_r = y_l + 1.0;
1328       for (j=0; j<a->rlen[i]; j++) {
1329         x_l = a->colidx[shift+j*8];
1330         x_r = x_l + 1.0;
1331         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1332         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1333         count++;
1334       }
1335     }
1336     ierr = PetscDrawCollectiveEnd(draw);PetscCall(ierr);
1337   }
1338   PetscFunctionReturn(0);
1339 }
1340 
1341 #include <petscdraw.h>
1342 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1343 {
1344   PetscDraw      draw;
1345   PetscReal      xr,yr,xl,yl,h,w;
1346   PetscBool      isnull;
1347 
1348   PetscFunctionBegin;
1349   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1350   PetscCall(PetscDrawIsNull(draw,&isnull));
1351   if (isnull) PetscFunctionReturn(0);
1352 
1353   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1354   xr  += w;          yr += h;         xl = -w;     yl = -h;
1355   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1356   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1357   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1358   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1359   PetscCall(PetscDrawSave(draw));
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1364 {
1365   PetscBool      iascii,isbinary,isdraw;
1366 
1367   PetscFunctionBegin;
1368   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1369   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1370   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1371   if (iascii) {
1372     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1373   } else if (isbinary) {
1374     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1375   } else if (isdraw) {
1376     PetscCall(MatView_SeqSELL_Draw(A,viewer));
1377   }
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1382 {
1383   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1384   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1385   MatScalar      *vp;
1386 
1387   PetscFunctionBegin;
1388   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1389   /* To do: compress out the unused elements */
1390   PetscCall(MatMarkDiagonal_SeqSELL(A));
1391   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1392   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1393   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1394   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1395   for (i=0; i<a->totalslices; ++i) {
1396     shift = a->sliidx[i];    /* starting index of the slice */
1397     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1398     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1399     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1400       row  = 8*i + row_in_slice;
1401       nrow = a->rlen[row]; /* number of nonzeros in row */
1402       /*
1403         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1404         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1405       */
1406       lastcol = 0;
1407       if (nrow>0) { /* nonempty row */
1408         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1409       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1410         for (j=1;j<8;j++) {
1411           if (a->rlen[8*i+j]) {
1412             lastcol = cp[j];
1413             break;
1414           }
1415         }
1416       } else {
1417         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1418       }
1419 
1420       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1421         cp[8*k+row_in_slice] = lastcol;
1422         vp[8*k+row_in_slice] = (MatScalar)0;
1423       }
1424     }
1425   }
1426 
1427   A->info.mallocs += a->reallocs;
1428   a->reallocs      = 0;
1429 
1430   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1435 {
1436   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1437 
1438   PetscFunctionBegin;
1439   info->block_size   = 1.0;
1440   info->nz_allocated = a->maxallocmat;
1441   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1442   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1443   info->assemblies   = A->num_ass;
1444   info->mallocs      = A->info.mallocs;
1445   info->memory       = ((PetscObject)A)->mem;
1446   if (A->factortype) {
1447     info->fill_ratio_given  = A->info.fill_ratio_given;
1448     info->fill_ratio_needed = A->info.fill_ratio_needed;
1449     info->factor_mallocs    = A->info.factor_mallocs;
1450   } else {
1451     info->fill_ratio_given  = 0;
1452     info->fill_ratio_needed = 0;
1453     info->factor_mallocs    = 0;
1454   }
1455   PetscFunctionReturn(0);
1456 }
1457 
1458 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1459 {
1460   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1461   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1462   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1463   MatScalar      *vp,value;
1464 
1465   PetscFunctionBegin;
1466   for (k=0; k<m; k++) { /* loop over added rows */
1467     row = im[k];
1468     if (row < 0) continue;
1469     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1470     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1471     cp    = a->colidx+shift; /* pointer to the row */
1472     vp    = a->val+shift; /* pointer to the row */
1473     nrow  = a->rlen[row];
1474     low   = 0;
1475     high  = nrow;
1476 
1477     for (l=0; l<n; l++) { /* loop over added columns */
1478       col = in[l];
1479       if (col<0) continue;
1480       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1481       if (a->roworiented) {
1482         value = v[l+k*n];
1483       } else {
1484         value = v[k+l*m];
1485       }
1486       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1487 
1488       /* search in this row for the specified column, i indicates the column to be set */
1489       if (col <= lastcol) low = 0;
1490       else high = nrow;
1491       lastcol = col;
1492       while (high-low > 5) {
1493         t = (low+high)/2;
1494         if (*(cp+t*8) > col) high = t;
1495         else low = t;
1496       }
1497       for (i=low; i<high; i++) {
1498         if (*(cp+i*8) > col) break;
1499         if (*(cp+i*8) == col) {
1500           if (is == ADD_VALUES) *(vp+i*8) += value;
1501           else *(vp+i*8) = value;
1502           low = i + 1;
1503           goto noinsert;
1504         }
1505       }
1506       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1507       if (nonew == 1) goto noinsert;
1508       PetscCheckFalse(nonew == -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1509       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1510       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1511       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1512       for (ii=nrow-1; ii>=i; ii--) {
1513         *(cp+(ii+1)*8) = *(cp+ii*8);
1514         *(vp+(ii+1)*8) = *(vp+ii*8);
1515       }
1516       a->rlen[row]++;
1517       *(cp+i*8) = col;
1518       *(vp+i*8) = value;
1519       a->nz++;
1520       A->nonzerostate++;
1521       low = i+1; high++; nrow++;
1522 noinsert:;
1523     }
1524     a->rlen[row] = nrow;
1525   }
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1530 {
1531   PetscFunctionBegin;
1532   /* If the two matrices have the same copy implementation, use fast copy. */
1533   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1534     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1535     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1536 
1537     PetscCheckFalse(a->sliidx[a->totalslices] != b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1538     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1539   } else {
1540     PetscCall(MatCopy_Basic(A,B,str));
1541   }
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1546 {
1547   PetscFunctionBegin;
1548   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1553 {
1554   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1555 
1556   PetscFunctionBegin;
1557   *array = a->val;
1558   PetscFunctionReturn(0);
1559 }
1560 
1561 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1562 {
1563   PetscFunctionBegin;
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1568 {
1569   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1570   PetscInt    i;
1571   MatScalar   *aval=a->val;
1572 
1573   PetscFunctionBegin;
1574   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1579 {
1580   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1581   PetscInt       i;
1582   MatScalar      *aval=a->val;
1583 
1584   PetscFunctionBegin;
1585   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1586   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1591 {
1592   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1593   MatScalar      *aval=a->val;
1594   PetscScalar    oalpha=alpha;
1595   PetscBLASInt   one=1,size;
1596 
1597   PetscFunctionBegin;
1598   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1599   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1600   PetscCall(PetscLogFlops(a->nz));
1601   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1606 {
1607   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1608 
1609   PetscFunctionBegin;
1610   if (!Y->preallocated || !y->nz) {
1611     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1612   }
1613   PetscCall(MatShift_Basic(Y,a));
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1618 {
1619   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1620   PetscScalar       *x,sum,*t;
1621   const MatScalar   *idiag=NULL,*mdiag;
1622   const PetscScalar *b,*xb;
1623   PetscInt          n,m=A->rmap->n,i,j,shift;
1624   const PetscInt    *diag;
1625 
1626   PetscFunctionBegin;
1627   its = its*lits;
1628 
1629   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1630   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1631   a->fshift = fshift;
1632   a->omega  = omega;
1633 
1634   diag  = a->diag;
1635   t     = a->ssor_work;
1636   idiag = a->idiag;
1637   mdiag = a->mdiag;
1638 
1639   PetscCall(VecGetArray(xx,&x));
1640   PetscCall(VecGetArrayRead(bb,&b));
1641   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1642   PetscCheckFalse(flag == SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1643   PetscCheckFalse(flag == SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1644   PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1645 
1646   if (flag & SOR_ZERO_INITIAL_GUESS) {
1647     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1648       for (i=0; i<m; i++) {
1649         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1650         sum   = b[i];
1651         n     = (diag[i]-shift)/8;
1652         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1653         t[i]  = sum;
1654         x[i]  = sum*idiag[i];
1655       }
1656       xb   = t;
1657       PetscCall(PetscLogFlops(a->nz));
1658     } else xb = b;
1659     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1660       for (i=m-1; i>=0; i--) {
1661         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1662         sum   = xb[i];
1663         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1664         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1665         if (xb == b) {
1666           x[i] = sum*idiag[i];
1667         } else {
1668           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1669         }
1670       }
1671       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1672     }
1673     its--;
1674   }
1675   while (its--) {
1676     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1677       for (i=0; i<m; i++) {
1678         /* lower */
1679         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1680         sum   = b[i];
1681         n     = (diag[i]-shift)/8;
1682         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1683         t[i]  = sum;             /* save application of the lower-triangular part */
1684         /* upper */
1685         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1686         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1687         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1688       }
1689       xb   = t;
1690       PetscCall(PetscLogFlops(2.0*a->nz));
1691     } else xb = b;
1692     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1693       for (i=m-1; i>=0; i--) {
1694         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1695         sum = xb[i];
1696         if (xb == b) {
1697           /* whole matrix (no checkpointing available) */
1698           n     = a->rlen[i];
1699           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1700           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1701         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1702           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1703           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1704           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1705         }
1706       }
1707       if (xb == b) {
1708         PetscCall(PetscLogFlops(2.0*a->nz));
1709       } else {
1710         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1711       }
1712     }
1713   }
1714   PetscCall(VecRestoreArray(xx,&x));
1715   PetscCall(VecRestoreArrayRead(bb,&b));
1716   PetscFunctionReturn(0);
1717 }
1718 
1719 /* -------------------------------------------------------------------*/
1720 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1721                                        MatGetRow_SeqSELL,
1722                                        MatRestoreRow_SeqSELL,
1723                                        MatMult_SeqSELL,
1724                                /* 4*/  MatMultAdd_SeqSELL,
1725                                        MatMultTranspose_SeqSELL,
1726                                        MatMultTransposeAdd_SeqSELL,
1727                                        NULL,
1728                                        NULL,
1729                                        NULL,
1730                                /* 10*/ NULL,
1731                                        NULL,
1732                                        NULL,
1733                                        MatSOR_SeqSELL,
1734                                        NULL,
1735                                /* 15*/ MatGetInfo_SeqSELL,
1736                                        MatEqual_SeqSELL,
1737                                        MatGetDiagonal_SeqSELL,
1738                                        MatDiagonalScale_SeqSELL,
1739                                        NULL,
1740                                /* 20*/ NULL,
1741                                        MatAssemblyEnd_SeqSELL,
1742                                        MatSetOption_SeqSELL,
1743                                        MatZeroEntries_SeqSELL,
1744                                /* 24*/ NULL,
1745                                        NULL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                /* 29*/ MatSetUp_SeqSELL,
1750                                        NULL,
1751                                        NULL,
1752                                        NULL,
1753                                        NULL,
1754                                /* 34*/ MatDuplicate_SeqSELL,
1755                                        NULL,
1756                                        NULL,
1757                                        NULL,
1758                                        NULL,
1759                                /* 39*/ NULL,
1760                                        NULL,
1761                                        NULL,
1762                                        MatGetValues_SeqSELL,
1763                                        MatCopy_SeqSELL,
1764                                /* 44*/ NULL,
1765                                        MatScale_SeqSELL,
1766                                        MatShift_SeqSELL,
1767                                        NULL,
1768                                        NULL,
1769                                /* 49*/ NULL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                /* 59*/ NULL,
1780                                        MatDestroy_SeqSELL,
1781                                        MatView_SeqSELL,
1782                                        NULL,
1783                                        NULL,
1784                                /* 64*/ NULL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                /* 69*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                /* 74*/ NULL,
1795                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                /* 79*/ NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                /* 84*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                /* 89*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                /* 94*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                /* 99*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        MatConjugate_SeqSELL,
1823                                        NULL,
1824                                /*104*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                /*109*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        MatMissingDiagonal_SeqSELL,
1834                                /*114*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                /*119*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                /*124*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                /*129*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                /*134*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                /*139*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        MatFDColoringSetUp_SeqXAIJ,
1863                                        NULL,
1864                                /*144*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL
1868 };
1869 
1870 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1871 {
1872   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1873 
1874   PetscFunctionBegin;
1875   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1876 
1877   /* allocate space for values if not already there */
1878   if (!a->saved_values) {
1879     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1880     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1881   }
1882 
1883   /* copy values over */
1884   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1885   PetscFunctionReturn(0);
1886 }
1887 
1888 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1889 {
1890   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1891 
1892   PetscFunctionBegin;
1893   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1894   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1895   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 /*@C
1900  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1901 
1902  Not Collective
1903 
1904  Input Parameters:
1905  .  mat - a MATSEQSELL matrix
1906  .  array - pointer to the data
1907 
1908  Level: intermediate
1909 
1910  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1911  @*/
1912 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1913 {
1914   PetscFunctionBegin;
1915   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1916   PetscFunctionReturn(0);
1917 }
1918 
1919 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1920 {
1921   Mat_SeqSELL    *b;
1922   PetscMPIInt    size;
1923 
1924   PetscFunctionBegin;
1925   PetscCall(PetscCitationsRegister(citation,&cited));
1926   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1927   PetscCheckFalse(size > 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1928 
1929   PetscCall(PetscNewLog(B,&b));
1930 
1931   B->data = (void*)b;
1932 
1933   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1934 
1935   b->row                = NULL;
1936   b->col                = NULL;
1937   b->icol               = NULL;
1938   b->reallocs           = 0;
1939   b->ignorezeroentries  = PETSC_FALSE;
1940   b->roworiented        = PETSC_TRUE;
1941   b->nonew              = 0;
1942   b->diag               = NULL;
1943   b->solve_work         = NULL;
1944   B->spptr              = NULL;
1945   b->saved_values       = NULL;
1946   b->idiag              = NULL;
1947   b->mdiag              = NULL;
1948   b->ssor_work          = NULL;
1949   b->omega              = 1.0;
1950   b->fshift             = 0.0;
1951   b->idiagvalid         = PETSC_FALSE;
1952   b->keepnonzeropattern = PETSC_FALSE;
1953 
1954   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1955   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1956   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1957   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1958   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1959   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1960   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1961   PetscFunctionReturn(0);
1962 }
1963 
1964 /*
1965  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1966  */
1967 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1968 {
1969   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1970   PetscInt       i,m=A->rmap->n;
1971   PetscInt       totalslices=a->totalslices;
1972 
1973   PetscFunctionBegin;
1974   C->factortype = A->factortype;
1975   c->row        = NULL;
1976   c->col        = NULL;
1977   c->icol       = NULL;
1978   c->reallocs   = 0;
1979   C->assembled = PETSC_TRUE;
1980 
1981   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1982   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1983 
1984   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1985   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1986   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1987   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1988 
1989   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1990   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1991 
1992   /* allocate the matrix space */
1993   if (mallocmatspace) {
1994     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
1995     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
1996 
1997     c->singlemalloc = PETSC_TRUE;
1998 
1999     if (m > 0) {
2000       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2001       if (cpvalues == MAT_COPY_VALUES) {
2002         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2003       } else {
2004         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2005       }
2006     }
2007   }
2008 
2009   c->ignorezeroentries = a->ignorezeroentries;
2010   c->roworiented       = a->roworiented;
2011   c->nonew             = a->nonew;
2012   if (a->diag) {
2013     PetscCall(PetscMalloc1(m,&c->diag));
2014     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2015     for (i=0; i<m; i++) {
2016       c->diag[i] = a->diag[i];
2017     }
2018   } else c->diag = NULL;
2019 
2020   c->solve_work         = NULL;
2021   c->saved_values       = NULL;
2022   c->idiag              = NULL;
2023   c->ssor_work          = NULL;
2024   c->keepnonzeropattern = a->keepnonzeropattern;
2025   c->free_val           = PETSC_TRUE;
2026   c->free_colidx        = PETSC_TRUE;
2027 
2028   c->maxallocmat  = a->maxallocmat;
2029   c->maxallocrow  = a->maxallocrow;
2030   c->rlenmax      = a->rlenmax;
2031   c->nz           = a->nz;
2032   C->preallocated = PETSC_TRUE;
2033 
2034   c->nonzerorowcnt = a->nonzerorowcnt;
2035   C->nonzerostate  = A->nonzerostate;
2036 
2037   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2038   PetscFunctionReturn(0);
2039 }
2040 
2041 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2042 {
2043   PetscFunctionBegin;
2044   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2045   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2046   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2047     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2048   }
2049   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2050   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2051   PetscFunctionReturn(0);
2052 }
2053 
2054 /*MC
2055    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2056    based on the sliced Ellpack format
2057 
2058    Options Database Keys:
2059 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2060 
2061    Level: beginner
2062 
2063 .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
2064 M*/
2065 
2066 /*MC
2067    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2068 
2069    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2070    and MATMPISELL otherwise.  As a result, for single process communicators,
2071   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2072   for communicators controlling multiple processes.  It is recommended that you call both of
2073   the above preallocation routines for simplicity.
2074 
2075    Options Database Keys:
2076 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2077 
2078   Level: beginner
2079 
2080   Notes:
2081    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2082 
2083    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2084    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2085 
2086   Developer Notes:
2087    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2088 
2089    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2090 .vb
2091                             (2 0  3 4)
2092    Consider the matrix A =  (5 0  6 0)
2093                             (0 0  7 8)
2094                             (0 0  9 9)
2095 
2096    symbolically the Ellpack format can be written as
2097 
2098         (2 3 4 |)           (0 2 3 |)
2099    v =  (5 6 0 |)  colidx = (0 2 2 |)
2100         --------            ---------
2101         (7 8 |)             (2 3 |)
2102         (9 9 |)             (2 3 |)
2103 
2104     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2105     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2106     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2107 
2108     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2109 
2110 .ve
2111 
2112       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2113 
2114  References:
2115 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2116    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2117 
2118 .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
2119 M*/
2120 
2121 /*@C
2122        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2123 
2124  Collective on comm
2125 
2126  Input Parameters:
2127 +  comm - MPI communicator, set to PETSC_COMM_SELF
2128 .  m - number of rows
2129 .  n - number of columns
2130 .  rlenmax - maximum number of nonzeros in a row
2131 -  rlen - array containing the number of nonzeros in the various rows
2132  (possibly different for each row) or NULL
2133 
2134  Output Parameter:
2135 .  A - the matrix
2136 
2137  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2138  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2139  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2140 
2141  Notes:
2142  If nnz is given then nz is ignored
2143 
2144  Specify the preallocated storage with either rlenmax or rlen (not both).
2145  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2146  allocation.  For large problems you MUST preallocate memory or you
2147  will get TERRIBLE performance, see the users' manual chapter on matrices.
2148 
2149  Level: intermediate
2150 
2151  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL
2152 
2153  @*/
2154 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2155 {
2156   PetscFunctionBegin;
2157   PetscCall(MatCreate(comm,A));
2158   PetscCall(MatSetSizes(*A,m,n,m,n));
2159   PetscCall(MatSetType(*A,MATSEQSELL));
2160   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2161   PetscFunctionReturn(0);
2162 }
2163 
2164 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2165 {
2166   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2167   PetscInt       totalslices=a->totalslices;
2168 
2169   PetscFunctionBegin;
2170   /* If the  matrix dimensions are not equal,or no of nonzeros */
2171   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2172     *flg = PETSC_FALSE;
2173     PetscFunctionReturn(0);
2174   }
2175   /* if the a->colidx are the same */
2176   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2177   if (!*flg) PetscFunctionReturn(0);
2178   /* if a->val are the same */
2179   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2180   PetscFunctionReturn(0);
2181 }
2182 
2183 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2184 {
2185   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2186 
2187   PetscFunctionBegin;
2188   a->idiagvalid  = PETSC_FALSE;
2189   PetscFunctionReturn(0);
2190 }
2191 
2192 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2193 {
2194 #if defined(PETSC_USE_COMPLEX)
2195   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2196   PetscInt    i;
2197   PetscScalar *val = a->val;
2198 
2199   PetscFunctionBegin;
2200   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2201     val[i] = PetscConj(val[i]);
2202   }
2203 #else
2204   PetscFunctionBegin;
2205 #endif
2206   PetscFunctionReturn(0);
2207 }
2208