xref: /petsc/src/mat/impls/sell/seq/sell.c (revision b45e3bf4ff73d80a20c3202b6cd9f79d2f2d3efe)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscErrorCode ierr;
88 
89   PetscFunctionBegin;
90   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
91   PetscValidType(B,1);
92   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
93   PetscFunctionReturn(0);
94 }
95 
96 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
97 {
98   Mat_SeqSELL    *b;
99   PetscInt       i,j,totalslices;
100   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
101   PetscErrorCode ierr;
102 
103   PetscFunctionBegin;
104   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
105   if (maxallocrow == MAT_SKIP_ALLOCATION) {
106     skipallocation = PETSC_TRUE;
107     maxallocrow    = 0;
108   }
109 
110   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
111   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
112 
113   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
114   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
115   PetscCheckFalse(maxallocrow < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
116   if (rlen) {
117     for (i=0; i<B->rmap->n; i++) {
118       PetscCheckFalse(rlen[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
119       PetscCheckFalse(rlen[i] > B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
120     }
121   }
122 
123   B->preallocated = PETSC_TRUE;
124 
125   b = (Mat_SeqSELL*)B->data;
126 
127   totalslices = PetscCeilInt(B->rmap->n,8);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n & 0x07) {ierr = PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n);CHKERRQ(ierr);}
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
134       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
140     } else {
141       maxallocrow = 0;
142       b->sliidx[0] = 0;
143       for (i=1; i<totalslices; i++) {
144         b->sliidx[i] = 0;
145         for (j=0;j<8;j++) {
146           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
147         }
148         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
149         ierr = PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);CHKERRQ(ierr);
150       }
151       /* last slice */
152       b->sliidx[totalslices] = 0;
153       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
154       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
155       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
156     }
157 
158     /* allocate space for val, colidx, rlen */
159     /* FIXME: should B's old memory be unlogged? */
160     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
161     /* FIXME: assuming an element of the bit array takes 8 bits */
162     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
163     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
164     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
165     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
166     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
167 
168     b->singlemalloc = PETSC_TRUE;
169     b->free_val     = PETSC_TRUE;
170     b->free_colidx  = PETSC_TRUE;
171   } else {
172     b->free_val    = PETSC_FALSE;
173     b->free_colidx = PETSC_FALSE;
174   }
175 
176   b->nz               = 0;
177   b->maxallocrow      = maxallocrow;
178   b->rlenmax          = maxallocrow;
179   b->maxallocmat      = b->sliidx[totalslices];
180   B->info.nz_unneeded = (double)b->maxallocmat;
181   if (realalloc) {
182     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
183   }
184   PetscFunctionReturn(0);
185 }
186 
187 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
188 {
189   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
190   PetscInt    shift;
191 
192   PetscFunctionBegin;
193   PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
194   if (nz) *nz = a->rlen[row];
195   shift = a->sliidx[row>>3]+(row&0x07);
196   if (!a->getrowcols) {
197     PetscErrorCode ierr;
198 
199     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
200   }
201   if (idx) {
202     PetscInt j;
203     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
204     *idx = a->getrowcols;
205   }
206   if (v) {
207     PetscInt j;
208     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
209     *v = a->getrowvals;
210   }
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
215 {
216   PetscFunctionBegin;
217   PetscFunctionReturn(0);
218 }
219 
220 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
221 {
222   Mat            B;
223   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
224   PetscInt       i;
225   PetscErrorCode ierr;
226 
227   PetscFunctionBegin;
228   if (reuse == MAT_REUSE_MATRIX) {
229     B    = *newmat;
230     ierr = MatZeroEntries(B);CHKERRQ(ierr);
231   } else {
232     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
233     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
234     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
235     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
236   }
237 
238   for (i=0; i<A->rmap->n; i++) {
239     PetscInt    nz = 0,*cols = NULL;
240     PetscScalar *vals = NULL;
241 
242     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
243     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
244     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
245   }
246 
247   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
248   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
249   B->rmap->bs = A->rmap->bs;
250 
251   if (reuse == MAT_INPLACE_MATRIX) {
252     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
253   } else {
254     *newmat = B;
255   }
256   PetscFunctionReturn(0);
257 }
258 
259 #include <../src/mat/impls/aij/seq/aij.h>
260 
261 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
262 {
263   Mat               B;
264   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
265   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
266   const PetscInt    *cols;
267   const PetscScalar *vals;
268   PetscErrorCode    ierr;
269 
270   PetscFunctionBegin;
271 
272   if (reuse == MAT_REUSE_MATRIX) {
273     B = *newmat;
274   } else {
275     if (PetscDefined(USE_DEBUG) || !a->ilen) {
276       ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
277       for (i=0; i<m; i++) {
278         rowlengths[i] = ai[i+1] - ai[i];
279       }
280     }
281     if (PetscDefined(USE_DEBUG) && a->ilen) {
282       PetscBool eq;
283       ierr = PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq);CHKERRQ(ierr);
284       PetscCheckFalse(!eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
285       ierr = PetscFree(rowlengths);CHKERRQ(ierr);
286       rowlengths = a->ilen;
287     } else if (a->ilen) rowlengths = a->ilen;
288     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
289     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
290     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
291     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
292     if (rowlengths != a->ilen) {ierr = PetscFree(rowlengths);CHKERRQ(ierr);}
293   }
294 
295   for (row=0; row<m; row++) {
296     ierr = MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);CHKERRQ(ierr);
297     ierr = MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
298     ierr = MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals);CHKERRQ(ierr);
299   }
300   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
301   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
302   B->rmap->bs = A->rmap->bs;
303 
304   if (reuse == MAT_INPLACE_MATRIX) {
305     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
306   } else {
307     *newmat = B;
308   }
309   PetscFunctionReturn(0);
310 }
311 
312 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
315   PetscScalar       *y;
316   const PetscScalar *x;
317   const MatScalar   *aval=a->val;
318   PetscInt          totalslices=a->totalslices;
319   const PetscInt    *acolidx=a->colidx;
320   PetscInt          i,j;
321   PetscErrorCode    ierr;
322 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
323   __m512d           vec_x,vec_y,vec_vals;
324   __m256i           vec_idx;
325   __mmask8          mask;
326   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
327   __m256i           vec_idx2,vec_idx3,vec_idx4;
328 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m128i           vec_idx;
330   __m256d           vec_x,vec_y,vec_y2,vec_vals;
331   MatScalar         yval;
332   PetscInt          r,rows_left,row,nnz_in_row;
333 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
334   __m128d           vec_x_tmp;
335   __m256d           vec_x,vec_y,vec_y2,vec_vals;
336   MatScalar         yval;
337   PetscInt          r,rows_left,row,nnz_in_row;
338 #else
339   PetscScalar       sum[8];
340 #endif
341 
342 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
343 #pragma disjoint(*x,*y,*aval)
344 #endif
345 
346   PetscFunctionBegin;
347   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
348   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
349 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
350   for (i=0; i<totalslices; i++) { /* loop over slices */
351     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
352     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
353 
354     vec_y  = _mm512_setzero_pd();
355     vec_y2 = _mm512_setzero_pd();
356     vec_y3 = _mm512_setzero_pd();
357     vec_y4 = _mm512_setzero_pd();
358 
359     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
360     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
361     case 3:
362       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
363       acolidx += 8; aval += 8;
364       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
365       acolidx += 8; aval += 8;
366       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
367       acolidx += 8; aval += 8;
368       j += 3;
369       break;
370     case 2:
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374       acolidx += 8; aval += 8;
375       j += 2;
376       break;
377     case 1:
378       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
379       acolidx += 8; aval += 8;
380       j += 1;
381       break;
382     }
383     #pragma novector
384     for (; j<(a->sliidx[i+1]>>3); j+=4) {
385       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
386       acolidx += 8; aval += 8;
387       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
388       acolidx += 8; aval += 8;
389       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
390       acolidx += 8; aval += 8;
391       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
392       acolidx += 8; aval += 8;
393     }
394 
395     vec_y = _mm512_add_pd(vec_y,vec_y2);
396     vec_y = _mm512_add_pd(vec_y,vec_y3);
397     vec_y = _mm512_add_pd(vec_y,vec_y4);
398     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
399       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
400       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
401     } else {
402       _mm512_storeu_pd(&y[8*i],vec_y);
403     }
404   }
405 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
406   for (i=0; i<totalslices; i++) { /* loop over full slices */
407     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
408     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
409 
410     /* last slice may have padding rows. Don't use vectorization. */
411     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
412       rows_left = A->rmap->n - 8*i;
413       for (r=0; r<rows_left; ++r) {
414         yval = (MatScalar)0;
415         row = 8*i + r;
416         nnz_in_row = a->rlen[row];
417         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
418         y[row] = yval;
419       }
420       break;
421     }
422 
423     vec_y  = _mm256_setzero_pd();
424     vec_y2 = _mm256_setzero_pd();
425 
426     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
427     #pragma novector
428     #pragma unroll(2)
429     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
430       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
431       aval += 4; acolidx += 4;
432       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
433       aval += 4; acolidx += 4;
434     }
435 
436     _mm256_storeu_pd(y+i*8,vec_y);
437     _mm256_storeu_pd(y+i*8+4,vec_y2);
438   }
439 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
440   for (i=0; i<totalslices; i++) { /* loop over full slices */
441     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
442     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
443 
444     vec_y  = _mm256_setzero_pd();
445     vec_y2 = _mm256_setzero_pd();
446 
447     /* last slice may have padding rows. Don't use vectorization. */
448     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
449       rows_left = A->rmap->n - 8*i;
450       for (r=0; r<rows_left; ++r) {
451         yval = (MatScalar)0;
452         row = 8*i + r;
453         nnz_in_row = a->rlen[row];
454         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
455         y[row] = yval;
456       }
457       break;
458     }
459 
460     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
461     #pragma novector
462     #pragma unroll(2)
463     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
464       vec_vals  = _mm256_loadu_pd(aval);
465       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
466       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
467       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
468       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
471       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
472       aval     += 4;
473 
474       vec_vals  = _mm256_loadu_pd(aval);
475       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
476       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
477       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
478       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
479       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
480       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
481       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
482       aval     += 4;
483     }
484 
485     _mm256_storeu_pd(y + i*8,     vec_y);
486     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
487   }
488 #else
489   for (i=0; i<totalslices; i++) { /* loop over slices */
490     for (j=0; j<8; j++) sum[j] = 0.0;
491     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
492       sum[0] += aval[j] * x[acolidx[j]];
493       sum[1] += aval[j+1] * x[acolidx[j+1]];
494       sum[2] += aval[j+2] * x[acolidx[j+2]];
495       sum[3] += aval[j+3] * x[acolidx[j+3]];
496       sum[4] += aval[j+4] * x[acolidx[j+4]];
497       sum[5] += aval[j+5] * x[acolidx[j+5]];
498       sum[6] += aval[j+6] * x[acolidx[j+6]];
499       sum[7] += aval[j+7] * x[acolidx[j+7]];
500     }
501     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
502       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
503     } else {
504       for (j=0; j<8; j++) y[8*i+j] = sum[j];
505     }
506   }
507 #endif
508 
509   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
510   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
511   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
512   PetscFunctionReturn(0);
513 }
514 
515 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
516 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
517 {
518   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
519   PetscScalar       *y,*z;
520   const PetscScalar *x;
521   const MatScalar   *aval=a->val;
522   PetscInt          totalslices=a->totalslices;
523   const PetscInt    *acolidx=a->colidx;
524   PetscInt          i,j;
525   PetscErrorCode    ierr;
526 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
527   __m512d           vec_x,vec_y,vec_vals;
528   __m256i           vec_idx;
529   __mmask8          mask;
530   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
531   __m256i           vec_idx2,vec_idx3,vec_idx4;
532 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
533   __m128d           vec_x_tmp;
534   __m256d           vec_x,vec_y,vec_y2,vec_vals;
535   MatScalar         yval;
536   PetscInt          r,row,nnz_in_row;
537 #else
538   PetscScalar       sum[8];
539 #endif
540 
541 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
542 #pragma disjoint(*x,*y,*aval)
543 #endif
544 
545   PetscFunctionBegin;
546   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
547   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
548 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
549   for (i=0; i<totalslices; i++) { /* loop over slices */
550     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
551     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
552 
553     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
554       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
555       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
556     } else {
557       vec_y  = _mm512_loadu_pd(&y[8*i]);
558     }
559     vec_y2 = _mm512_setzero_pd();
560     vec_y3 = _mm512_setzero_pd();
561     vec_y4 = _mm512_setzero_pd();
562 
563     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
564     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
565     case 3:
566       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
567       acolidx += 8; aval += 8;
568       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
569       acolidx += 8; aval += 8;
570       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
571       acolidx += 8; aval += 8;
572       j += 3;
573       break;
574     case 2:
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
578       acolidx += 8; aval += 8;
579       j += 2;
580       break;
581     case 1:
582       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
583       acolidx += 8; aval += 8;
584       j += 1;
585       break;
586     }
587     #pragma novector
588     for (; j<(a->sliidx[i+1]>>3); j+=4) {
589       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
590       acolidx += 8; aval += 8;
591       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
592       acolidx += 8; aval += 8;
593       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
594       acolidx += 8; aval += 8;
595       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
596       acolidx += 8; aval += 8;
597     }
598 
599     vec_y = _mm512_add_pd(vec_y,vec_y2);
600     vec_y = _mm512_add_pd(vec_y,vec_y3);
601     vec_y = _mm512_add_pd(vec_y,vec_y4);
602     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
603       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
604     } else {
605       _mm512_storeu_pd(&z[8*i],vec_y);
606     }
607   }
608 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
609   for (i=0; i<totalslices; i++) { /* loop over full slices */
610     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
611     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
612 
613     /* last slice may have padding rows. Don't use vectorization. */
614     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
615       for (r=0; r<(A->rmap->n & 0x07); ++r) {
616         row        = 8*i + r;
617         yval       = (MatScalar)0.0;
618         nnz_in_row = a->rlen[row];
619         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
620         z[row] = y[row] + yval;
621       }
622       break;
623     }
624 
625     vec_y  = _mm256_loadu_pd(y+8*i);
626     vec_y2 = _mm256_loadu_pd(y+8*i+4);
627 
628     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
629     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
630       vec_vals  = _mm256_loadu_pd(aval);
631       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
632       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
633       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
634       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
637       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
638       aval     += 4;
639 
640       vec_vals  = _mm256_loadu_pd(aval);
641       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
642       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
643       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
644       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
645       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
646       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
647       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
648       aval     += 4;
649     }
650 
651     _mm256_storeu_pd(z+i*8,vec_y);
652     _mm256_storeu_pd(z+i*8+4,vec_y2);
653   }
654 #else
655   for (i=0; i<totalslices; i++) { /* loop over slices */
656     for (j=0; j<8; j++) sum[j] = 0.0;
657     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
658       sum[0] += aval[j] * x[acolidx[j]];
659       sum[1] += aval[j+1] * x[acolidx[j+1]];
660       sum[2] += aval[j+2] * x[acolidx[j+2]];
661       sum[3] += aval[j+3] * x[acolidx[j+3]];
662       sum[4] += aval[j+4] * x[acolidx[j+4]];
663       sum[5] += aval[j+5] * x[acolidx[j+5]];
664       sum[6] += aval[j+6] * x[acolidx[j+6]];
665       sum[7] += aval[j+7] * x[acolidx[j+7]];
666     }
667     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
668       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
669     } else {
670       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
671     }
672   }
673 #endif
674 
675   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
676   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
677   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
678   PetscFunctionReturn(0);
679 }
680 
681 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
682 {
683   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
684   PetscScalar       *y;
685   const PetscScalar *x;
686   const MatScalar   *aval=a->val;
687   const PetscInt    *acolidx=a->colidx;
688   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
689   PetscErrorCode    ierr;
690 
691 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
692 #pragma disjoint(*x,*y,*aval)
693 #endif
694 
695   PetscFunctionBegin;
696   if (A->symmetric) {
697     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
698     PetscFunctionReturn(0);
699   }
700   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
701   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
702   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
703   for (i=0; i<a->totalslices; i++) { /* loop over slices */
704     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
705       for (r=0; r<(A->rmap->n & 0x07); ++r) {
706         row        = 8*i + r;
707         nnz_in_row = a->rlen[row];
708         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
709       }
710       break;
711     }
712     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
713       y[acolidx[j]]   += aval[j] * x[8*i];
714       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
715       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
716       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
717       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
718       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
719       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
720       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
721     }
722   }
723   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
724   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
725   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
726   PetscFunctionReturn(0);
727 }
728 
729 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
730 {
731   PetscErrorCode ierr;
732 
733   PetscFunctionBegin;
734   if (A->symmetric) {
735     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
736   } else {
737     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
738     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
739   }
740   PetscFunctionReturn(0);
741 }
742 
743 /*
744      Checks for missing diagonals
745 */
746 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
747 {
748   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
749   PetscInt       *diag,i;
750   PetscErrorCode ierr;
751 
752   PetscFunctionBegin;
753   *missing = PETSC_FALSE;
754   if (A->rmap->n > 0 && !(a->colidx)) {
755     *missing = PETSC_TRUE;
756     if (d) *d = 0;
757     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
758   } else {
759     diag = a->diag;
760     for (i=0; i<A->rmap->n; i++) {
761       if (diag[i] == -1) {
762         *missing = PETSC_TRUE;
763         if (d) *d = i;
764         ierr = PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i);CHKERRQ(ierr);
765         break;
766       }
767     }
768   }
769   PetscFunctionReturn(0);
770 }
771 
772 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
773 {
774   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
775   PetscInt       i,j,m=A->rmap->n,shift;
776   PetscErrorCode ierr;
777 
778   PetscFunctionBegin;
779   if (!a->diag) {
780     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
781     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
782     a->free_diag = PETSC_TRUE;
783   }
784   for (i=0; i<m; i++) { /* loop over rows */
785     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
786     a->diag[i] = -1;
787     for (j=0; j<a->rlen[i]; j++) {
788       if (a->colidx[shift+j*8] == i) {
789         a->diag[i] = shift+j*8;
790         break;
791       }
792     }
793   }
794   PetscFunctionReturn(0);
795 }
796 
797 /*
798   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
799 */
800 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
801 {
802   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
803   PetscInt       i,*diag,m = A->rmap->n;
804   MatScalar      *val = a->val;
805   PetscScalar    *idiag,*mdiag;
806   PetscErrorCode ierr;
807 
808   PetscFunctionBegin;
809   if (a->idiagvalid) PetscFunctionReturn(0);
810   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
811   diag = a->diag;
812   if (!a->idiag) {
813     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
814     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
815     val  = a->val;
816   }
817   mdiag = a->mdiag;
818   idiag = a->idiag;
819 
820   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
821     for (i=0; i<m; i++) {
822       mdiag[i] = val[diag[i]];
823       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
824         if (PetscRealPart(fshift)) {
825           ierr = PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i);CHKERRQ(ierr);
826           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
827           A->factorerror_zeropivot_value = 0.0;
828           A->factorerror_zeropivot_row   = i;
829         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
830       }
831       idiag[i] = 1.0/val[diag[i]];
832     }
833     ierr = PetscLogFlops(m);CHKERRQ(ierr);
834   } else {
835     for (i=0; i<m; i++) {
836       mdiag[i] = val[diag[i]];
837       idiag[i] = omega/(fshift + val[diag[i]]);
838     }
839     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
840   }
841   a->idiagvalid = PETSC_TRUE;
842   PetscFunctionReturn(0);
843 }
844 
845 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
846 {
847   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
848   PetscErrorCode ierr;
849 
850   PetscFunctionBegin;
851   ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
852   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
853   PetscFunctionReturn(0);
854 }
855 
856 PetscErrorCode MatDestroy_SeqSELL(Mat A)
857 {
858   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
859   PetscErrorCode ierr;
860 
861   PetscFunctionBegin;
862 #if defined(PETSC_USE_LOG)
863   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
864 #endif
865   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
866   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
867   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
868   ierr = PetscFree(a->diag);CHKERRQ(ierr);
869   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
870   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
871   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
872   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
873   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
874   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
875   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
876 
877   ierr = PetscFree(A->data);CHKERRQ(ierr);
878 
879   ierr = PetscObjectChangeTypeName((PetscObject)A,NULL);CHKERRQ(ierr);
880   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
881   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
882   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
883   PetscFunctionReturn(0);
884 }
885 
886 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
887 {
888   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
889   PetscErrorCode ierr;
890 
891   PetscFunctionBegin;
892   switch (op) {
893   case MAT_ROW_ORIENTED:
894     a->roworiented = flg;
895     break;
896   case MAT_KEEP_NONZERO_PATTERN:
897     a->keepnonzeropattern = flg;
898     break;
899   case MAT_NEW_NONZERO_LOCATIONS:
900     a->nonew = (flg ? 0 : 1);
901     break;
902   case MAT_NEW_NONZERO_LOCATION_ERR:
903     a->nonew = (flg ? -1 : 0);
904     break;
905   case MAT_NEW_NONZERO_ALLOCATION_ERR:
906     a->nonew = (flg ? -2 : 0);
907     break;
908   case MAT_UNUSED_NONZERO_LOCATION_ERR:
909     a->nounused = (flg ? -1 : 0);
910     break;
911   case MAT_FORCE_DIAGONAL_ENTRIES:
912   case MAT_IGNORE_OFF_PROC_ENTRIES:
913   case MAT_USE_HASH_TABLE:
914   case MAT_SORTED_FULL:
915     ierr = PetscInfo(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
916     break;
917   case MAT_SPD:
918   case MAT_SYMMETRIC:
919   case MAT_STRUCTURALLY_SYMMETRIC:
920   case MAT_HERMITIAN:
921   case MAT_SYMMETRY_ETERNAL:
922     /* These options are handled directly by MatSetOption() */
923     break;
924   default:
925     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
926   }
927   PetscFunctionReturn(0);
928 }
929 
930 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
931 {
932   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
933   PetscInt       i,j,n,shift;
934   PetscScalar    *x,zero=0.0;
935   PetscErrorCode ierr;
936 
937   PetscFunctionBegin;
938   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
939   PetscCheckFalse(n != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
940 
941   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
942     PetscInt *diag=a->diag;
943     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
944     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
945     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
946     PetscFunctionReturn(0);
947   }
948 
949   ierr = VecSet(v,zero);CHKERRQ(ierr);
950   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
951   for (i=0; i<n; i++) { /* loop over rows */
952     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
953     x[i] = 0;
954     for (j=0; j<a->rlen[i]; j++) {
955       if (a->colidx[shift+j*8] == i) {
956         x[i] = a->val[shift+j*8];
957         break;
958       }
959     }
960   }
961   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
962   PetscFunctionReturn(0);
963 }
964 
965 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
966 {
967   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
968   const PetscScalar *l,*r;
969   PetscInt          i,j,m,n,row;
970   PetscErrorCode    ierr;
971 
972   PetscFunctionBegin;
973   if (ll) {
974     /* The local size is used so that VecMPI can be passed to this routine
975        by MatDiagonalScale_MPISELL */
976     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
977     PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
978     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
979     for (i=0; i<a->totalslices; i++) { /* loop over slices */
980       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
981         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
982           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
983         }
984       } else {
985         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
986           a->val[j] *= l[8*i+row];
987         }
988       }
989     }
990     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
991     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
992   }
993   if (rr) {
994     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
995     PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
996     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
997     for (i=0; i<a->totalslices; i++) { /* loop over slices */
998       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
999         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1000           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
1001         }
1002       } else {
1003         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
1004           a->val[j] *= r[a->colidx[j]];
1005         }
1006       }
1007     }
1008     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
1009     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1010   }
1011   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1016 {
1017   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1018   PetscInt    *cp,i,k,low,high,t,row,col,l;
1019   PetscInt    shift;
1020   MatScalar   *vp;
1021 
1022   PetscFunctionBegin;
1023   for (k=0; k<m; k++) { /* loop over requested rows */
1024     row = im[k];
1025     if (row<0) continue;
1026     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1027     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1028     cp = a->colidx+shift; /* pointer to the row */
1029     vp = a->val+shift; /* pointer to the row */
1030     for (l=0; l<n; l++) { /* loop over requested columns */
1031       col = in[l];
1032       if (col<0) continue;
1033       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1034       high = a->rlen[row]; low = 0; /* assume unsorted */
1035       while (high-low > 5) {
1036         t = (low+high)/2;
1037         if (*(cp+t*8) > col) high = t;
1038         else low = t;
1039       }
1040       for (i=low; i<high; i++) {
1041         if (*(cp+8*i) > col) break;
1042         if (*(cp+8*i) == col) {
1043           *v++ = *(vp+8*i);
1044           goto finished;
1045         }
1046       }
1047       *v++ = 0.0;
1048     finished:;
1049     }
1050   }
1051   PetscFunctionReturn(0);
1052 }
1053 
1054 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1055 {
1056   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1057   PetscInt          i,j,m=A->rmap->n,shift;
1058   const char        *name;
1059   PetscViewerFormat format;
1060   PetscErrorCode    ierr;
1061 
1062   PetscFunctionBegin;
1063   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1064   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1065     PetscInt nofinalvalue = 0;
1066     /*
1067     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1068       nofinalvalue = 1;
1069     }
1070     */
1071     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1072     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n);CHKERRQ(ierr);
1073     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz);CHKERRQ(ierr);
1074 #if defined(PETSC_USE_COMPLEX)
1075     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1076 #else
1077     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1078 #endif
1079     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1080 
1081     for (i=0; i<m; i++) {
1082       shift = a->sliidx[i>>3]+(i&0x07);
1083       for (j=0; j<a->rlen[i]; j++) {
1084 #if defined(PETSC_USE_COMPLEX)
1085         ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1086 #else
1087         ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1088 #endif
1089       }
1090     }
1091     /*
1092     if (nofinalvalue) {
1093 #if defined(PETSC_USE_COMPLEX)
1094       ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1095 #else
1096       ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1097 #endif
1098     }
1099     */
1100     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1101     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1102     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1103   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1104     PetscFunctionReturn(0);
1105   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1106     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1107     for (i=0; i<m; i++) {
1108       ierr = PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);CHKERRQ(ierr);
1109       shift = a->sliidx[i>>3]+(i&0x07);
1110       for (j=0; j<a->rlen[i]; j++) {
1111 #if defined(PETSC_USE_COMPLEX)
1112         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1113           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1114         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1115           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1116         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1117           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1118         }
1119 #else
1120         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1121 #endif
1122       }
1123       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1124     }
1125     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1126   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1127     PetscInt    cnt=0,jcnt;
1128     PetscScalar value;
1129 #if defined(PETSC_USE_COMPLEX)
1130     PetscBool   realonly=PETSC_TRUE;
1131     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1132       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1133         realonly = PETSC_FALSE;
1134         break;
1135       }
1136     }
1137 #endif
1138 
1139     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1140     for (i=0; i<m; i++) {
1141       jcnt = 0;
1142       shift = a->sliidx[i>>3]+(i&0x07);
1143       for (j=0; j<A->cmap->n; j++) {
1144         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1145           value = a->val[cnt++];
1146           jcnt++;
1147         } else {
1148           value = 0.0;
1149         }
1150 #if defined(PETSC_USE_COMPLEX)
1151         if (realonly) {
1152           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1153         } else {
1154           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1155         }
1156 #else
1157         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1158 #endif
1159       }
1160       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1161     }
1162     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1163   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1164     PetscInt fshift=1;
1165     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1166 #if defined(PETSC_USE_COMPLEX)
1167     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1168 #else
1169     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1170 #endif
1171     ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1172     for (i=0; i<m; i++) {
1173       shift = a->sliidx[i>>3]+(i&0x07);
1174       for (j=0; j<a->rlen[i]; j++) {
1175 #if defined(PETSC_USE_COMPLEX)
1176         ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1177 #else
1178         ierr = PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1179 #endif
1180       }
1181     }
1182     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1183   } else if (format == PETSC_VIEWER_NATIVE) {
1184     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1185       PetscInt row;
1186       ierr = PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1187       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1188 #if defined(PETSC_USE_COMPLEX)
1189         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1190           ierr = PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1191         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1192           ierr = PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1193         } else {
1194           ierr = PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1195         }
1196 #else
1197         ierr = PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1198 #endif
1199       }
1200     }
1201   } else {
1202     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1203     if (A->factortype) {
1204       for (i=0; i<m; i++) {
1205         shift = a->sliidx[i>>3]+(i&0x07);
1206         ierr = PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);CHKERRQ(ierr);
1207         /* L part */
1208         for (j=shift; j<a->diag[i]; j+=8) {
1209 #if defined(PETSC_USE_COMPLEX)
1210           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1211             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1212           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1213             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1214           } else {
1215             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1216           }
1217 #else
1218           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1219 #endif
1220         }
1221         /* diagonal */
1222         j = a->diag[i];
1223 #if defined(PETSC_USE_COMPLEX)
1224         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1225           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1226         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1227           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1228         } else {
1229           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1230         }
1231 #else
1232         ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1233 #endif
1234 
1235         /* U part */
1236         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1237 #if defined(PETSC_USE_COMPLEX)
1238           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1239             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1240           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1241             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1242           } else {
1243             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1244           }
1245 #else
1246           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1247 #endif
1248         }
1249         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1250       }
1251     } else {
1252       for (i=0; i<m; i++) {
1253         shift = a->sliidx[i>>3]+(i&0x07);
1254         ierr = PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i);CHKERRQ(ierr);
1255         for (j=0; j<a->rlen[i]; j++) {
1256 #if defined(PETSC_USE_COMPLEX)
1257           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1258             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1259           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1260             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1261           } else {
1262             ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1263           }
1264 #else
1265           ierr = PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1266 #endif
1267         }
1268         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1269       }
1270     }
1271     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1272   }
1273   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #include <petscdraw.h>
1278 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1279 {
1280   Mat               A=(Mat)Aa;
1281   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1282   PetscInt          i,j,m=A->rmap->n,shift;
1283   int               color;
1284   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1285   PetscViewer       viewer;
1286   PetscViewerFormat format;
1287   PetscErrorCode    ierr;
1288 
1289   PetscFunctionBegin;
1290   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1291   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1292   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1293 
1294   /* loop over matrix elements drawing boxes */
1295 
1296   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1297     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1298     /* Blue for negative, Cyan for zero and  Red for positive */
1299     color = PETSC_DRAW_BLUE;
1300     for (i=0; i<m; i++) {
1301       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1302       y_l = m - i - 1.0; y_r = y_l + 1.0;
1303       for (j=0; j<a->rlen[i]; j++) {
1304         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1305         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1306         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1307       }
1308     }
1309     color = PETSC_DRAW_CYAN;
1310     for (i=0; i<m; i++) {
1311       shift = a->sliidx[i>>3]+(i&0x07);
1312       y_l = m - i - 1.0; y_r = y_l + 1.0;
1313       for (j=0; j<a->rlen[i]; j++) {
1314         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1315         if (a->val[shift+8*j] !=  0.) continue;
1316         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1317       }
1318     }
1319     color = PETSC_DRAW_RED;
1320     for (i=0; i<m; i++) {
1321       shift = a->sliidx[i>>3]+(i&0x07);
1322       y_l = m - i - 1.0; y_r = y_l + 1.0;
1323       for (j=0; j<a->rlen[i]; j++) {
1324         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1325         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1326         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1327       }
1328     }
1329     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1330   } else {
1331     /* use contour shading to indicate magnitude of values */
1332     /* first determine max of all nonzero values */
1333     PetscReal minv=0.0,maxv=0.0;
1334     PetscInt  count=0;
1335     PetscDraw popup;
1336     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1337       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1338     }
1339     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1340     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1341     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1342 
1343     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1344     for (i=0; i<m; i++) {
1345       shift = a->sliidx[i>>3]+(i&0x07);
1346       y_l = m - i - 1.0;
1347       y_r = y_l + 1.0;
1348       for (j=0; j<a->rlen[i]; j++) {
1349         x_l = a->colidx[shift+j*8];
1350         x_r = x_l + 1.0;
1351         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1352         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1353         count++;
1354       }
1355     }
1356     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1357   }
1358   PetscFunctionReturn(0);
1359 }
1360 
1361 #include <petscdraw.h>
1362 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1363 {
1364   PetscDraw      draw;
1365   PetscReal      xr,yr,xl,yl,h,w;
1366   PetscBool      isnull;
1367   PetscErrorCode ierr;
1368 
1369   PetscFunctionBegin;
1370   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1371   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1372   if (isnull) PetscFunctionReturn(0);
1373 
1374   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1375   xr  += w;          yr += h;         xl = -w;     yl = -h;
1376   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1377   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1378   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1379   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1380   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1385 {
1386   PetscBool      iascii,isbinary,isdraw;
1387   PetscErrorCode ierr;
1388 
1389   PetscFunctionBegin;
1390   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1391   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1392   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1393   if (iascii) {
1394     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1395   } else if (isbinary) {
1396     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1397   } else if (isdraw) {
1398     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1399   }
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1404 {
1405   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1406   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1407   MatScalar      *vp;
1408   PetscErrorCode ierr;
1409 
1410   PetscFunctionBegin;
1411   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1412   /* To do: compress out the unused elements */
1413   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1414   ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1415   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs);CHKERRQ(ierr);
1416   ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax);CHKERRQ(ierr);
1417   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1418   for (i=0; i<a->totalslices; ++i) {
1419     shift = a->sliidx[i];    /* starting index of the slice */
1420     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1421     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1422     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1423       row  = 8*i + row_in_slice;
1424       nrow = a->rlen[row]; /* number of nonzeros in row */
1425       /*
1426         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1427         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1428       */
1429       lastcol = 0;
1430       if (nrow>0) { /* nonempty row */
1431         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1432       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1433         for (j=1;j<8;j++) {
1434           if (a->rlen[8*i+j]) {
1435             lastcol = cp[j];
1436             break;
1437           }
1438         }
1439       } else {
1440         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1441       }
1442 
1443       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1444         cp[8*k+row_in_slice] = lastcol;
1445         vp[8*k+row_in_slice] = (MatScalar)0;
1446       }
1447     }
1448   }
1449 
1450   A->info.mallocs += a->reallocs;
1451   a->reallocs      = 0;
1452 
1453   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1458 {
1459   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1460 
1461   PetscFunctionBegin;
1462   info->block_size   = 1.0;
1463   info->nz_allocated = a->maxallocmat;
1464   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1465   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1466   info->assemblies   = A->num_ass;
1467   info->mallocs      = A->info.mallocs;
1468   info->memory       = ((PetscObject)A)->mem;
1469   if (A->factortype) {
1470     info->fill_ratio_given  = A->info.fill_ratio_given;
1471     info->fill_ratio_needed = A->info.fill_ratio_needed;
1472     info->factor_mallocs    = A->info.factor_mallocs;
1473   } else {
1474     info->fill_ratio_given  = 0;
1475     info->fill_ratio_needed = 0;
1476     info->factor_mallocs    = 0;
1477   }
1478   PetscFunctionReturn(0);
1479 }
1480 
1481 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1482 {
1483   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1484   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1485   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1486   MatScalar      *vp,value;
1487   PetscErrorCode ierr;
1488 
1489   PetscFunctionBegin;
1490   for (k=0; k<m; k++) { /* loop over added rows */
1491     row = im[k];
1492     if (row < 0) continue;
1493     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1494     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1495     cp    = a->colidx+shift; /* pointer to the row */
1496     vp    = a->val+shift; /* pointer to the row */
1497     nrow  = a->rlen[row];
1498     low   = 0;
1499     high  = nrow;
1500 
1501     for (l=0; l<n; l++) { /* loop over added columns */
1502       col = in[l];
1503       if (col<0) continue;
1504       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1505       if (a->roworiented) {
1506         value = v[l+k*n];
1507       } else {
1508         value = v[k+l*m];
1509       }
1510       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1511 
1512       /* search in this row for the specified column, i indicates the column to be set */
1513       if (col <= lastcol) low = 0;
1514       else high = nrow;
1515       lastcol = col;
1516       while (high-low > 5) {
1517         t = (low+high)/2;
1518         if (*(cp+t*8) > col) high = t;
1519         else low = t;
1520       }
1521       for (i=low; i<high; i++) {
1522         if (*(cp+i*8) > col) break;
1523         if (*(cp+i*8) == col) {
1524           if (is == ADD_VALUES) *(vp+i*8) += value;
1525           else *(vp+i*8) = value;
1526           low = i + 1;
1527           goto noinsert;
1528         }
1529       }
1530       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1531       if (nonew == 1) goto noinsert;
1532       PetscCheckFalse(nonew == -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1533       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1534       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1535       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1536       for (ii=nrow-1; ii>=i; ii--) {
1537         *(cp+(ii+1)*8) = *(cp+ii*8);
1538         *(vp+(ii+1)*8) = *(vp+ii*8);
1539       }
1540       a->rlen[row]++;
1541       *(cp+i*8) = col;
1542       *(vp+i*8) = value;
1543       a->nz++;
1544       A->nonzerostate++;
1545       low = i+1; high++; nrow++;
1546 noinsert:;
1547     }
1548     a->rlen[row] = nrow;
1549   }
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1554 {
1555   PetscErrorCode ierr;
1556 
1557   PetscFunctionBegin;
1558   /* If the two matrices have the same copy implementation, use fast copy. */
1559   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1560     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1561     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1562 
1563     PetscCheckFalse(a->sliidx[a->totalslices] != b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1564     ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1565   } else {
1566     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1567   }
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1572 {
1573   PetscErrorCode ierr;
1574 
1575   PetscFunctionBegin;
1576   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL);CHKERRQ(ierr);
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1581 {
1582   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1583 
1584   PetscFunctionBegin;
1585   *array = a->val;
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1590 {
1591   PetscFunctionBegin;
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1596 {
1597   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1598   PetscInt    i;
1599   MatScalar   *aval=a->val;
1600 
1601   PetscFunctionBegin;
1602   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1607 {
1608   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1609   PetscInt       i;
1610   MatScalar      *aval=a->val;
1611   PetscErrorCode ierr;
1612 
1613   PetscFunctionBegin;
1614   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1615   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1620 {
1621   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1622   MatScalar      *aval=a->val;
1623   PetscScalar    oalpha=alpha;
1624   PetscBLASInt   one=1,size;
1625   PetscErrorCode ierr;
1626 
1627   PetscFunctionBegin;
1628   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1629   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1630   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1631   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1632   PetscFunctionReturn(0);
1633 }
1634 
1635 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1636 {
1637   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1638   PetscErrorCode ierr;
1639 
1640   PetscFunctionBegin;
1641   if (!Y->preallocated || !y->nz) {
1642     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1643   }
1644   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1649 {
1650   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1651   PetscScalar       *x,sum,*t;
1652   const MatScalar   *idiag=NULL,*mdiag;
1653   const PetscScalar *b,*xb;
1654   PetscInt          n,m=A->rmap->n,i,j,shift;
1655   const PetscInt    *diag;
1656   PetscErrorCode    ierr;
1657 
1658   PetscFunctionBegin;
1659   its = its*lits;
1660 
1661   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1662   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1663   a->fshift = fshift;
1664   a->omega  = omega;
1665 
1666   diag  = a->diag;
1667   t     = a->ssor_work;
1668   idiag = a->idiag;
1669   mdiag = a->mdiag;
1670 
1671   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1672   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1673   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1674   PetscCheckFalse(flag == SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1675   PetscCheckFalse(flag == SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1676   PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1677 
1678   if (flag & SOR_ZERO_INITIAL_GUESS) {
1679     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1680       for (i=0; i<m; i++) {
1681         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1682         sum   = b[i];
1683         n     = (diag[i]-shift)/8;
1684         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1685         t[i]  = sum;
1686         x[i]  = sum*idiag[i];
1687       }
1688       xb   = t;
1689       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1690     } else xb = b;
1691     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1692       for (i=m-1; i>=0; i--) {
1693         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1694         sum   = xb[i];
1695         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1696         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1697         if (xb == b) {
1698           x[i] = sum*idiag[i];
1699         } else {
1700           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1701         }
1702       }
1703       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1704     }
1705     its--;
1706   }
1707   while (its--) {
1708     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1709       for (i=0; i<m; i++) {
1710         /* lower */
1711         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1712         sum   = b[i];
1713         n     = (diag[i]-shift)/8;
1714         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1715         t[i]  = sum;             /* save application of the lower-triangular part */
1716         /* upper */
1717         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1718         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1719         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1720       }
1721       xb   = t;
1722       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1723     } else xb = b;
1724     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1725       for (i=m-1; i>=0; i--) {
1726         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1727         sum = xb[i];
1728         if (xb == b) {
1729           /* whole matrix (no checkpointing available) */
1730           n     = a->rlen[i];
1731           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1732           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1733         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1734           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1735           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1736           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1737         }
1738       }
1739       if (xb == b) {
1740         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1741       } else {
1742         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1743       }
1744     }
1745   }
1746   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1747   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1748   PetscFunctionReturn(0);
1749 }
1750 
1751 /* -------------------------------------------------------------------*/
1752 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1753                                        MatGetRow_SeqSELL,
1754                                        MatRestoreRow_SeqSELL,
1755                                        MatMult_SeqSELL,
1756                                /* 4*/  MatMultAdd_SeqSELL,
1757                                        MatMultTranspose_SeqSELL,
1758                                        MatMultTransposeAdd_SeqSELL,
1759                                        NULL,
1760                                        NULL,
1761                                        NULL,
1762                                /* 10*/ NULL,
1763                                        NULL,
1764                                        NULL,
1765                                        MatSOR_SeqSELL,
1766                                        NULL,
1767                                /* 15*/ MatGetInfo_SeqSELL,
1768                                        MatEqual_SeqSELL,
1769                                        MatGetDiagonal_SeqSELL,
1770                                        MatDiagonalScale_SeqSELL,
1771                                        NULL,
1772                                /* 20*/ NULL,
1773                                        MatAssemblyEnd_SeqSELL,
1774                                        MatSetOption_SeqSELL,
1775                                        MatZeroEntries_SeqSELL,
1776                                /* 24*/ NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        NULL,
1780                                        NULL,
1781                                /* 29*/ MatSetUp_SeqSELL,
1782                                        NULL,
1783                                        NULL,
1784                                        NULL,
1785                                        NULL,
1786                                /* 34*/ MatDuplicate_SeqSELL,
1787                                        NULL,
1788                                        NULL,
1789                                        NULL,
1790                                        NULL,
1791                                /* 39*/ NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        MatGetValues_SeqSELL,
1795                                        MatCopy_SeqSELL,
1796                                /* 44*/ NULL,
1797                                        MatScale_SeqSELL,
1798                                        MatShift_SeqSELL,
1799                                        NULL,
1800                                        NULL,
1801                                /* 49*/ NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        NULL,
1805                                        NULL,
1806                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1807                                        NULL,
1808                                        NULL,
1809                                        NULL,
1810                                        NULL,
1811                                /* 59*/ NULL,
1812                                        MatDestroy_SeqSELL,
1813                                        MatView_SeqSELL,
1814                                        NULL,
1815                                        NULL,
1816                                /* 64*/ NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        NULL,
1821                                /* 69*/ NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        NULL,
1826                                /* 74*/ NULL,
1827                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1828                                        NULL,
1829                                        NULL,
1830                                        NULL,
1831                                /* 79*/ NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        NULL,
1836                                /* 84*/ NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        NULL,
1841                                /* 89*/ NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                        NULL,
1846                                /* 94*/ NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        NULL,
1851                                /* 99*/ NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        MatConjugate_SeqSELL,
1855                                        NULL,
1856                                /*104*/ NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        NULL,
1861                                /*109*/ NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        MatMissingDiagonal_SeqSELL,
1866                                /*114*/ NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        NULL,
1871                                /*119*/ NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        NULL,
1875                                        NULL,
1876                                /*124*/ NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        NULL,
1881                                /*129*/ NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        NULL,
1885                                        NULL,
1886                                /*134*/ NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        NULL,
1891                                /*139*/ NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        MatFDColoringSetUp_SeqXAIJ,
1895                                        NULL,
1896                                /*144*/ NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        NULL
1900 };
1901 
1902 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1903 {
1904   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1905   PetscErrorCode ierr;
1906 
1907   PetscFunctionBegin;
1908   PetscCheckFalse(!a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1909 
1910   /* allocate space for values if not already there */
1911   if (!a->saved_values) {
1912     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1913     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1914   }
1915 
1916   /* copy values over */
1917   ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1922 {
1923   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1924   PetscErrorCode ierr;
1925 
1926   PetscFunctionBegin;
1927   PetscCheckFalse(!a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1928   PetscCheckFalse(!a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1929   ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1930   PetscFunctionReturn(0);
1931 }
1932 
1933 /*@C
1934  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1935 
1936  Not Collective
1937 
1938  Input Parameters:
1939  .  mat - a MATSEQSELL matrix
1940  .  array - pointer to the data
1941 
1942  Level: intermediate
1943 
1944  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1945  @*/
1946 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1947 {
1948   PetscErrorCode ierr;
1949 
1950   PetscFunctionBegin;
1951   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1952   PetscFunctionReturn(0);
1953 }
1954 
1955 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1956 {
1957   Mat_SeqSELL    *b;
1958   PetscMPIInt    size;
1959   PetscErrorCode ierr;
1960 
1961   PetscFunctionBegin;
1962   ierr = PetscCitationsRegister(citation,&cited);CHKERRQ(ierr);
1963   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
1964   PetscCheckFalse(size > 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1965 
1966   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1967 
1968   B->data = (void*)b;
1969 
1970   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1971 
1972   b->row                = NULL;
1973   b->col                = NULL;
1974   b->icol               = NULL;
1975   b->reallocs           = 0;
1976   b->ignorezeroentries  = PETSC_FALSE;
1977   b->roworiented        = PETSC_TRUE;
1978   b->nonew              = 0;
1979   b->diag               = NULL;
1980   b->solve_work         = NULL;
1981   B->spptr              = NULL;
1982   b->saved_values       = NULL;
1983   b->idiag              = NULL;
1984   b->mdiag              = NULL;
1985   b->ssor_work          = NULL;
1986   b->omega              = 1.0;
1987   b->fshift             = 0.0;
1988   b->idiagvalid         = PETSC_FALSE;
1989   b->keepnonzeropattern = PETSC_FALSE;
1990 
1991   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1992   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1993   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1994   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1995   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1996   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1997   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1998   PetscFunctionReturn(0);
1999 }
2000 
2001 /*
2002  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2003  */
2004 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
2005 {
2006   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
2007   PetscInt       i,m=A->rmap->n;
2008   PetscInt       totalslices=a->totalslices;
2009   PetscErrorCode ierr;
2010 
2011   PetscFunctionBegin;
2012   C->factortype = A->factortype;
2013   c->row        = NULL;
2014   c->col        = NULL;
2015   c->icol       = NULL;
2016   c->reallocs   = 0;
2017   C->assembled = PETSC_TRUE;
2018 
2019   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2020   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2021 
2022   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2023   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2024   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2025   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2026 
2027   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2028   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2029 
2030   /* allocate the matrix space */
2031   if (mallocmatspace) {
2032     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2033     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2034 
2035     c->singlemalloc = PETSC_TRUE;
2036 
2037     if (m > 0) {
2038       ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr);
2039       if (cpvalues == MAT_COPY_VALUES) {
2040         ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr);
2041       } else {
2042         ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr);
2043       }
2044     }
2045   }
2046 
2047   c->ignorezeroentries = a->ignorezeroentries;
2048   c->roworiented       = a->roworiented;
2049   c->nonew             = a->nonew;
2050   if (a->diag) {
2051     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2052     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2053     for (i=0; i<m; i++) {
2054       c->diag[i] = a->diag[i];
2055     }
2056   } else c->diag = NULL;
2057 
2058   c->solve_work         = NULL;
2059   c->saved_values       = NULL;
2060   c->idiag              = NULL;
2061   c->ssor_work          = NULL;
2062   c->keepnonzeropattern = a->keepnonzeropattern;
2063   c->free_val           = PETSC_TRUE;
2064   c->free_colidx        = PETSC_TRUE;
2065 
2066   c->maxallocmat  = a->maxallocmat;
2067   c->maxallocrow  = a->maxallocrow;
2068   c->rlenmax      = a->rlenmax;
2069   c->nz           = a->nz;
2070   C->preallocated = PETSC_TRUE;
2071 
2072   c->nonzerorowcnt = a->nonzerorowcnt;
2073   C->nonzerostate  = A->nonzerostate;
2074 
2075   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2076   PetscFunctionReturn(0);
2077 }
2078 
2079 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2080 {
2081   PetscErrorCode ierr;
2082 
2083   PetscFunctionBegin;
2084   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2085   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2086   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2087     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2088   }
2089   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2090   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2091   PetscFunctionReturn(0);
2092 }
2093 
2094 /*MC
2095    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2096    based on the sliced Ellpack format
2097 
2098    Options Database Keys:
2099 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2100 
2101    Level: beginner
2102 
2103 .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
2104 M*/
2105 
2106 /*MC
2107    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2108 
2109    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2110    and MATMPISELL otherwise.  As a result, for single process communicators,
2111   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2112   for communicators controlling multiple processes.  It is recommended that you call both of
2113   the above preallocation routines for simplicity.
2114 
2115    Options Database Keys:
2116 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2117 
2118   Level: beginner
2119 
2120   Notes:
2121    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2122 
2123    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2124    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2125 
2126   Developer Notes:
2127    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2128 
2129    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2130 .vb
2131                             (2 0  3 4)
2132    Consider the matrix A =  (5 0  6 0)
2133                             (0 0  7 8)
2134                             (0 0  9 9)
2135 
2136    symbolically the Ellpack format can be written as
2137 
2138         (2 3 4 |)           (0 2 3 |)
2139    v =  (5 6 0 |)  colidx = (0 2 2 |)
2140         --------            ---------
2141         (7 8 |)             (2 3 |)
2142         (9 9 |)             (2 3 |)
2143 
2144     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2145     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2146     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2147 
2148     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2149 
2150 .ve
2151 
2152       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2153 
2154  References:
2155    Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2156    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2157 
2158 .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
2159 M*/
2160 
2161 /*@C
2162        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2163 
2164  Collective on comm
2165 
2166  Input Parameters:
2167 +  comm - MPI communicator, set to PETSC_COMM_SELF
2168 .  m - number of rows
2169 .  n - number of columns
2170 .  rlenmax - maximum number of nonzeros in a row
2171 -  rlen - array containing the number of nonzeros in the various rows
2172  (possibly different for each row) or NULL
2173 
2174  Output Parameter:
2175 .  A - the matrix
2176 
2177  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2178  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2179  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2180 
2181  Notes:
2182  If nnz is given then nz is ignored
2183 
2184  Specify the preallocated storage with either rlenmax or rlen (not both).
2185  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2186  allocation.  For large problems you MUST preallocate memory or you
2187  will get TERRIBLE performance, see the users' manual chapter on matrices.
2188 
2189  Level: intermediate
2190 
2191  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL
2192 
2193  @*/
2194 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2195 {
2196   PetscErrorCode ierr;
2197 
2198   PetscFunctionBegin;
2199   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2200   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2201   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2202   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2203   PetscFunctionReturn(0);
2204 }
2205 
2206 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2207 {
2208   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2209   PetscInt       totalslices=a->totalslices;
2210   PetscErrorCode ierr;
2211 
2212   PetscFunctionBegin;
2213   /* If the  matrix dimensions are not equal,or no of nonzeros */
2214   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2215     *flg = PETSC_FALSE;
2216     PetscFunctionReturn(0);
2217   }
2218   /* if the a->colidx are the same */
2219   ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2220   if (!*flg) PetscFunctionReturn(0);
2221   /* if a->val are the same */
2222   ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2223   PetscFunctionReturn(0);
2224 }
2225 
2226 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2227 {
2228   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2229 
2230   PetscFunctionBegin;
2231   a->idiagvalid  = PETSC_FALSE;
2232   PetscFunctionReturn(0);
2233 }
2234 
2235 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2236 {
2237 #if defined(PETSC_USE_COMPLEX)
2238   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2239   PetscInt    i;
2240   PetscScalar *val = a->val;
2241 
2242   PetscFunctionBegin;
2243   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2244     val[i] = PetscConj(val[i]);
2245   }
2246 #else
2247   PetscFunctionBegin;
2248 #endif
2249   PetscFunctionReturn(0);
2250 }
2251