xref: /petsc/src/mat/impls/sell/seq/sell.c (revision feff33ee0b5b037fa8f9f294dede656a2f85cc47) !
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H)
9   #include <immintrin.h>
10 
11   #if !defined(_MM_SCALE_8)
12   #define _MM_SCALE_8    8
13   #endif
14 
15   #if defined(__AVX512F__)
16   /* these do not work
17    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
18    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
19   */
20     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
21     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
22     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
23     vec_vals = _mm512_loadu_pd(aval); \
24     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
25     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
26   #elif defined(__AVX2__)
27     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
28     vec_vals = _mm256_loadu_pd(aval); \
29     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
30     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
31     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
32   #endif
33 #endif  /* PETSC_HAVE_IMMINTRIN_H */
34 
35 /*@C
36  MatSeqSELLSetPreallocation - For good matrix assembly performance
37  the user should preallocate the matrix storage by setting the parameter nz
38  (or the array nnz).  By setting these parameters accurately, performance
39  during matrix assembly can be increased significantly.
40 
41  Collective on MPI_Comm
42 
43  Input Parameters:
44  +  B - The matrix
45  .  nz - number of nonzeros per row (same for all rows)
46  -  nnz - array containing the number of nonzeros in the various rows
47  (possibly different for each row) or NULL
48 
49  Notes:
50  If nnz is given then nz is ignored.
51 
52  Specify the preallocated storage with either nz or nnz (not both).
53  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
54  allocation.  For large problems you MUST preallocate memory or you
55  will get TERRIBLE performance, see the users' manual chapter on matrices.
56 
57  You can call MatGetInfo() to get information on how effective the preallocation was;
58  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
59  You can also run with the option -info and look for messages with the string
60  malloc in them to see if additional memory allocation was needed.
61 
62  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
63  entries or columns indices.
64 
65  The maximum number of nonzeos in any row should be as accuate as possible.
66  If it is underesitmated, you will get bad performance due to reallocation
67  (MatSeqXSELLReallocateSELL).
68 
69  Level: intermediate
70 
71  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
72 
73  @*/
74 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
75 {
76   PetscErrorCode ierr;
77 
78   PetscFunctionBegin;
79   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
80   PetscValidType(B,1);
81   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
82   PetscFunctionReturn(0);
83 }
84 
85 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
86 {
87   Mat_SeqSELL    *b;
88   PetscInt       i,j,totalslices;
89   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
90   PetscErrorCode ierr;
91 
92   PetscFunctionBegin;
93   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
94   if (maxallocrow == MAT_SKIP_ALLOCATION) {
95     skipallocation = PETSC_TRUE;
96     maxallocrow    = 0;
97   }
98 
99   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
100   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
101 
102   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
103   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
104   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
105   if (rlen) {
106     for (i=0; i<B->rmap->n; i++) {
107       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
108       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
109     }
110   }
111 
112   B->preallocated = PETSC_TRUE;
113 
114   b = (Mat_SeqSELL*)B->data;
115 
116   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
117   b->totalslices = totalslices;
118   if (!skipallocation) {
119     if (B->rmap->n & 0x07) PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);
120 
121     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
122       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
123       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
124     }
125     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
126       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
127       else if (maxallocrow < 0) maxallocrow = 1;
128       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
129     } else {
130       maxallocrow = 0;
131       b->sliidx[0] = 0;
132       for (i=1; i<totalslices; i++) {
133         b->sliidx[i] = 0;
134         for (j=0;j<8;j++) {
135           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
136         }
137         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
138         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
139       }
140       /* last slice */
141       b->sliidx[totalslices] = 0;
142       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
143       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
144       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
145     }
146 
147     /* allocate space for val, colidx, rlen */
148     /* FIXME: should B's old memory be unlogged? */
149     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
150     /* FIXME: assuming an element of the bit array takes 8 bits */
151     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
152     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
153     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
154     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
155     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
156 
157     b->singlemalloc = PETSC_TRUE;
158     b->free_val     = PETSC_TRUE;
159     b->free_colidx  = PETSC_TRUE;
160   } else {
161     b->free_val    = PETSC_FALSE;
162     b->free_colidx = PETSC_FALSE;
163   }
164 
165   b->nz               = 0;
166   b->maxallocrow      = maxallocrow;
167   b->rlenmax          = maxallocrow;
168   b->maxallocmat      = b->sliidx[totalslices];
169   B->info.nz_unneeded = (double)b->maxallocmat;
170   if (realalloc) {
171     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
172   }
173   PetscFunctionReturn(0);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
177 {
178   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
179   PetscInt    shift;
180 
181   PetscFunctionBegin;
182   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row>>3]+(row&0x07);
185   if (!a->getrowcols) {
186     PetscErrorCode ierr;
187 
188     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
189   }
190   if (idx) {
191     PetscInt j;
192     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
193     *idx = a->getrowcols;
194   }
195   if (v) {
196     PetscInt j;
197     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
198     *v = a->getrowvals;
199   }
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
204 {
205   PetscFunctionBegin;
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
210 {
211   Mat            B;
212   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
213   PetscInt       i;
214   PetscErrorCode ierr;
215 
216   PetscFunctionBegin;
217   if (reuse == MAT_REUSE_MATRIX) {
218     B    = *newmat;
219     ierr = MatZeroEntries(B);CHKERRQ(ierr);
220   } else {
221     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
222     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
223     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
224     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<A->rmap->n; i++) {
228     PetscInt    nz,*cols;
229     PetscScalar *vals;
230 
231     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
232     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
233     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
234   }
235 
236   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
237   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   B->rmap->bs = A->rmap->bs;
239 
240   if (reuse == MAT_INPLACE_MATRIX) {
241     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
242   } else {
243     *newmat = B;
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #include <../src/mat/impls/aij/seq/aij.h>
249 
250 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
251 {
252   Mat               B;
253   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
254   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
255   const PetscInt    *cols;
256   const PetscScalar *vals;
257   PetscErrorCode    ierr;
258 
259   PetscFunctionBegin;
260   if (A->rmap->bs > 1) {
261     ierr = MatConvert_Basic(A,newtype,reuse,newmat);CHKERRQ(ierr);
262     PetscFunctionReturn(0);
263   }
264 
265   if (reuse == MAT_REUSE_MATRIX) {
266     B = *newmat;
267   } else {
268     /* Can we just use ilen? */
269     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
270     for (i=0; i<m; i++) {
271       rowlengths[i] = ai[i+1] - ai[i];
272     }
273 
274     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
275     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
276     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
277     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
278     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
279   }
280 
281   for (row=0; row<m; row++) {
282     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
283     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
284     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
285   }
286   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
287   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   B->rmap->bs = A->rmap->bs;
289 
290   if (reuse == MAT_INPLACE_MATRIX) {
291     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
292   } else {
293     *newmat = B;
294   }
295   PetscFunctionReturn(0);
296 }
297 
298 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
299 {
300   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
301   PetscScalar       *y;
302   const PetscScalar *x;
303   const MatScalar   *aval=a->val;
304   PetscInt          totalslices=a->totalslices;
305   const PetscInt    *acolidx=a->colidx;
306   PetscInt          i,j;
307   PetscErrorCode    ierr;
308 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m512d           vec_x,vec_y,vec_vals;
310   __m256i           vec_idx;
311   __mmask8          mask;
312   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
313   __m256i           vec_idx2,vec_idx3,vec_idx4;
314 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
315   __m128i           vec_idx;
316   __m256d           vec_x,vec_y,vec_y2,vec_vals;
317   MatScalar         yval;
318   PetscInt          r,rows_left,row,nnz_in_row;
319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
320   __m128d           vec_x_tmp;
321   __m256d           vec_x,vec_y,vec_y2,vec_vals;
322   MatScalar         yval;
323   PetscInt          r,rows_left,row,nnz_in_row;
324 #else
325   PetscScalar       sum[8];
326 #endif
327 
328 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
329 #pragma disjoint(*x,*y,*aval)
330 #endif
331 
332   PetscFunctionBegin;
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
335 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   for (i=0; i<totalslices; i++) { /* loop over slices */
337     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
338     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
339 
340     vec_y  = _mm512_setzero_pd();
341     vec_y2 = _mm512_setzero_pd();
342     vec_y3 = _mm512_setzero_pd();
343     vec_y4 = _mm512_setzero_pd();
344 
345     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
346     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
347     case 3:
348       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
349       acolidx += 8; aval += 8;
350       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
351       acolidx += 8; aval += 8;
352       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
353       acolidx += 8; aval += 8;
354       j += 3;
355       break;
356     case 2:
357       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
360       acolidx += 8; aval += 8;
361       j += 2;
362       break;
363     case 1:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       j += 1;
367       break;
368     }
369     #pragma novector
370     for (; j<(a->sliidx[i+1]>>3); j+=4) {
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374       acolidx += 8; aval += 8;
375       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
376       acolidx += 8; aval += 8;
377       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
378       acolidx += 8; aval += 8;
379     }
380 
381     vec_y = _mm512_add_pd(vec_y,vec_y2);
382     vec_y = _mm512_add_pd(vec_y,vec_y3);
383     vec_y = _mm512_add_pd(vec_y,vec_y4);
384     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
386       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
387     } else {
388       _mm512_storeu_pd(&y[8*i],vec_y);
389     }
390   }
391 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
392   for (i=0; i<totalslices; i++) { /* loop over full slices */
393     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
394     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
395 
396     /* last slice may have padding rows. Don't use vectorization. */
397     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
398       rows_left = A->rmap->n - 8*i;
399       for (r=0; r<rows_left; ++r) {
400         yval = (MatScalar)0;
401         row = 8*i + r;
402         nnz_in_row = a->rlen[row];
403         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
404         y[row] = yval;
405       }
406       break;
407     }
408 
409     vec_y  = _mm256_setzero_pd();
410     vec_y2 = _mm256_setzero_pd();
411 
412     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413     #pragma novector
414     #pragma unroll(2)
415     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
416       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
417       aval += 4; acolidx += 4;
418       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
419       aval += 4; acolidx += 4;
420     }
421 
422     _mm256_storeu_pd(y+i*8,vec_y);
423     _mm256_storeu_pd(y+i*8+4,vec_y2);
424   }
425 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
426   for (i=0; i<totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
429 
430     vec_y  = _mm256_setzero_pd();
431     vec_y2 = _mm256_setzero_pd();
432 
433     /* last slice may have padding rows. Don't use vectorization. */
434     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
435       rows_left = A->rmap->n - 8*i;
436       for (r=0; r<rows_left; ++r) {
437         yval = (MatScalar)0;
438         row = 8*i + r;
439         nnz_in_row = a->rlen[row];
440         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
441         y[row] = yval;
442       }
443       break;
444     }
445 
446     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447     #pragma novector
448     #pragma unroll(2)
449     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
450       vec_vals  = _mm256_loadu_pd(aval);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
454       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
457       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
458       aval     += 4;
459 
460       vec_vals  = _mm256_loadu_pd(aval);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
467       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
468       aval     += 4;
469     }
470 
471     _mm256_storeu_pd(y + i*8,     vec_y);
472     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
473   }
474 #else
475   for (i=0; i<totalslices; i++) { /* loop over slices */
476     for (j=0; j<8; j++) sum[j] = 0.0;
477     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
478       sum[0] += aval[j] * x[acolidx[j]];
479       sum[1] += aval[j+1] * x[acolidx[j+1]];
480       sum[2] += aval[j+2] * x[acolidx[j+2]];
481       sum[3] += aval[j+3] * x[acolidx[j+3]];
482       sum[4] += aval[j+4] * x[acolidx[j+4]];
483       sum[5] += aval[j+5] * x[acolidx[j+5]];
484       sum[6] += aval[j+6] * x[acolidx[j+6]];
485       sum[7] += aval[j+7] * x[acolidx[j+7]];
486     }
487     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
489     } else {
490       for(j=0; j<8; j++) y[8*i+j] = sum[j];
491     }
492   }
493 #endif
494 
495   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
496   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
497   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
498   PetscFunctionReturn(0);
499 }
500 
501 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
503 {
504   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
505   PetscScalar       *y,*z;
506   const PetscScalar *x;
507   const MatScalar   *aval=a->val;
508   PetscInt          totalslices=a->totalslices;
509   const PetscInt    *acolidx=a->colidx;
510   PetscInt          i,j;
511   PetscErrorCode    ierr;
512 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
513   __m512d           vec_x,vec_y,vec_vals;
514   __m256i           vec_idx;
515   __mmask8          mask;
516   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
517   __m256i           vec_idx2,vec_idx3,vec_idx4;
518 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
519   __m128d           vec_x_tmp;
520   __m256d           vec_x,vec_y,vec_y2,vec_vals;
521   MatScalar         yval;
522   PetscInt          r,row,nnz_in_row;
523 #else
524   PetscScalar       sum[8];
525 #endif
526 
527 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
528 #pragma disjoint(*x,*y,*aval)
529 #endif
530 
531   PetscFunctionBegin;
532   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
533   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
535   for (i=0; i<totalslices; i++) { /* loop over slices */
536     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
537     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
538 
539     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
540       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
541       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
542     } else {
543       vec_y  = _mm512_loadu_pd(&y[8*i]);
544     }
545     vec_y2 = _mm512_setzero_pd();
546     vec_y3 = _mm512_setzero_pd();
547     vec_y4 = _mm512_setzero_pd();
548 
549     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
550     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
551     case 3:
552       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
553       acolidx += 8; aval += 8;
554       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
555       acolidx += 8; aval += 8;
556       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
557       acolidx += 8; aval += 8;
558       j += 3;
559       break;
560     case 2:
561       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
562       acolidx += 8; aval += 8;
563       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
564       acolidx += 8; aval += 8;
565       j += 2;
566       break;
567     case 1:
568       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
569       acolidx += 8; aval += 8;
570       j += 1;
571       break;
572     }
573     #pragma novector
574     for (; j<(a->sliidx[i+1]>>3); j+=4) {
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
578       acolidx += 8; aval += 8;
579       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
580       acolidx += 8; aval += 8;
581       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
582       acolidx += 8; aval += 8;
583     }
584 
585     vec_y = _mm512_add_pd(vec_y,vec_y2);
586     vec_y = _mm512_add_pd(vec_y,vec_y3);
587     vec_y = _mm512_add_pd(vec_y,vec_y4);
588     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
589       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
590     } else {
591       _mm512_storeu_pd(&z[8*i],vec_y);
592     }
593   }
594 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
595   for (i=0; i<totalslices; i++) { /* loop over full slices */
596     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
597     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
598 
599     /* last slice may have padding rows. Don't use vectorization. */
600     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
601       for (r=0; r<(A->rmap->n & 0x07); ++r) {
602         row        = 8*i + r;
603         yval       = (MatScalar)0.0;
604         nnz_in_row = a->rlen[row];
605         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
606         z[row] = y[row] + yval;
607       }
608       break;
609     }
610 
611     vec_y  = _mm256_loadu_pd(y+8*i);
612     vec_y2 = _mm256_loadu_pd(y+8*i+4);
613 
614     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
615     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
616       vec_vals  = _mm256_loadu_pd(aval);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
620       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
621       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
622       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
623       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
624       aval     += 4;
625 
626       vec_vals  = _mm256_loadu_pd(aval);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
630       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
631       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
632       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
633       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
634       aval     += 4;
635     }
636 
637     _mm256_storeu_pd(z+i*8,vec_y);
638     _mm256_storeu_pd(z+i*8+4,vec_y2);
639   }
640 #else
641   for (i=0; i<totalslices; i++) { /* loop over slices */
642     for (j=0; j<8; j++) sum[j] = 0.0;
643     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
644       sum[0] += aval[j] * x[acolidx[j]];
645       sum[1] += aval[j+1] * x[acolidx[j+1]];
646       sum[2] += aval[j+2] * x[acolidx[j+2]];
647       sum[3] += aval[j+3] * x[acolidx[j+3]];
648       sum[4] += aval[j+4] * x[acolidx[j+4]];
649       sum[5] += aval[j+5] * x[acolidx[j+5]];
650       sum[6] += aval[j+6] * x[acolidx[j+6]];
651       sum[7] += aval[j+7] * x[acolidx[j+7]];
652     }
653     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
654       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
655     } else {
656       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
657     }
658   }
659 #endif
660 
661   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
662   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
668 {
669   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
670   PetscScalar       *y;
671   const PetscScalar *x;
672   const MatScalar   *aval=a->val;
673   const PetscInt    *acolidx=a->colidx;
674   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
675   PetscErrorCode    ierr;
676 
677 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
678 #pragma disjoint(*x,*y,*aval)
679 #endif
680 
681   PetscFunctionBegin;
682   if (A->symmetric) {
683     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
684     PetscFunctionReturn(0);
685   }
686   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
687   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
689   for (i=0; i<a->totalslices; i++) { /* loop over slices */
690     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
691       for (r=0; r<(A->rmap->n & 0x07); ++r) {
692         row        = 8*i + r;
693         nnz_in_row = a->rlen[row];
694         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
695       }
696       break;
697     }
698     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
699       y[acolidx[j]]   += aval[j] * x[8*i];
700       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
701       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
702       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
703       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
704       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
705       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
706       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
707     }
708   }
709   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
710   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
711   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
716 {
717   PetscErrorCode ierr;
718 
719   PetscFunctionBegin;
720   if (A->symmetric) {
721     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
722   } else {
723     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
724     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
725   }
726   PetscFunctionReturn(0);
727 }
728 
729 /*
730      Checks for missing diagonals
731 */
732 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
733 {
734   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
735   PetscInt    *diag,i;
736 
737   PetscFunctionBegin;
738   *missing = PETSC_FALSE;
739   if (A->rmap->n > 0 && !(a->colidx)) {
740     *missing = PETSC_TRUE;
741     if (d) *d = 0;
742     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
743   } else {
744     diag = a->diag;
745     for (i=0; i<A->rmap->n; i++) {
746       if (diag[i] == -1) {
747         *missing = PETSC_TRUE;
748         if (d) *d = i;
749         PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);
750         break;
751       }
752     }
753   }
754   PetscFunctionReturn(0);
755 }
756 
757 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
758 {
759   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
760   PetscInt       i,j,m=A->rmap->n,shift;
761   PetscErrorCode ierr;
762 
763   PetscFunctionBegin;
764   if (!a->diag) {
765     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
766     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
767     a->free_diag = PETSC_TRUE;
768   }
769   for (i=0; i<m; i++) { /* loop over rows */
770     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
771     a->diag[i] = -1;
772     for (j=0; j<a->rlen[i]; j++) {
773       if (a->colidx[shift+j*8] == i) {
774         a->diag[i] = shift+j*8;
775         break;
776       }
777     }
778   }
779   PetscFunctionReturn(0);
780 }
781 
782 /*
783   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
784 */
785 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
786 {
787   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
788   PetscInt       i,*diag,m = A->rmap->n;
789   MatScalar      *val = a->val;
790   PetscScalar    *idiag,*mdiag;
791   PetscErrorCode ierr;
792 
793   PetscFunctionBegin;
794   if (a->idiagvalid) PetscFunctionReturn(0);
795   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
796   diag = a->diag;
797   if (!a->idiag) {
798     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
799     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
800     val  = a->val;
801   }
802   mdiag = a->mdiag;
803   idiag = a->idiag;
804 
805   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
806     for (i=0; i<m; i++) {
807       mdiag[i] = val[diag[i]];
808       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
809         if (PetscRealPart(fshift)) {
810           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
811           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
812           A->factorerror_zeropivot_value = 0.0;
813           A->factorerror_zeropivot_row   = i;
814         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
815       }
816       idiag[i] = 1.0/val[diag[i]];
817     }
818     ierr = PetscLogFlops(m);CHKERRQ(ierr);
819   } else {
820     for (i=0; i<m; i++) {
821       mdiag[i] = val[diag[i]];
822       idiag[i] = omega/(fshift + val[diag[i]]);
823     }
824     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
825   }
826   a->idiagvalid = PETSC_TRUE;
827   PetscFunctionReturn(0);
828 }
829 
830 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
831 {
832   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
833   PetscErrorCode ierr;
834 
835   PetscFunctionBegin;
836   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
837   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatDestroy_SeqSELL(Mat A)
842 {
843   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
844   PetscErrorCode ierr;
845 
846   PetscFunctionBegin;
847 #if defined(PETSC_USE_LOG)
848   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
849 #endif
850   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
851   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
852   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
853   ierr = PetscFree(a->diag);CHKERRQ(ierr);
854   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
855   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
856   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
857   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
858   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
859   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
860   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
861 
862   ierr = PetscFree(A->data);CHKERRQ(ierr);
863 
864   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
865   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
866   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
867 #if defined(PETSC_HAVE_ELEMENTAL)
868 #endif
869   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
870   PetscFunctionReturn(0);
871 }
872 
873 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
874 {
875   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
876   PetscErrorCode ierr;
877 
878   PetscFunctionBegin;
879   switch (op) {
880   case MAT_ROW_ORIENTED:
881     a->roworiented = flg;
882     break;
883   case MAT_KEEP_NONZERO_PATTERN:
884     a->keepnonzeropattern = flg;
885     break;
886   case MAT_NEW_NONZERO_LOCATIONS:
887     a->nonew = (flg ? 0 : 1);
888     break;
889   case MAT_NEW_NONZERO_LOCATION_ERR:
890     a->nonew = (flg ? -1 : 0);
891     break;
892   case MAT_NEW_NONZERO_ALLOCATION_ERR:
893     a->nonew = (flg ? -2 : 0);
894     break;
895   case MAT_UNUSED_NONZERO_LOCATION_ERR:
896     a->nounused = (flg ? -1 : 0);
897     break;
898   case MAT_NEW_DIAGONALS:
899   case MAT_IGNORE_OFF_PROC_ENTRIES:
900   case MAT_USE_HASH_TABLE:
901     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
902     break;
903   case MAT_SPD:
904   case MAT_SYMMETRIC:
905   case MAT_STRUCTURALLY_SYMMETRIC:
906   case MAT_HERMITIAN:
907   case MAT_SYMMETRY_ETERNAL:
908     /* These options are handled directly by MatSetOption() */
909     break;
910   default:
911     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
912   }
913   PetscFunctionReturn(0);
914 }
915 
916 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
917 {
918   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
919   PetscInt       i,j,n,shift;
920   PetscScalar    *x,zero=0.0;
921   PetscErrorCode ierr;
922 
923   PetscFunctionBegin;
924   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
925   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
926 
927   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
928     PetscInt *diag=a->diag;
929     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
930     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
931     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
932     PetscFunctionReturn(0);
933   }
934 
935   ierr = VecSet(v,zero);CHKERRQ(ierr);
936   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
937   for (i=0; i<n; i++) { /* loop over rows */
938     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
939     x[i] = 0;
940     for (j=0; j<a->rlen[i]; j++) {
941       if (a->colidx[shift+j*8] == i) {
942         x[i] = a->val[shift+j*8];
943         break;
944       }
945     }
946   }
947   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
948   PetscFunctionReturn(0);
949 }
950 
951 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
952 {
953   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
954   const PetscScalar *l,*r;
955   PetscInt          i,j,m,n,row;
956   PetscErrorCode    ierr;
957 
958   PetscFunctionBegin;
959   if (ll) {
960     /* The local size is used so that VecMPI can be passed to this routine
961        by MatDiagonalScale_MPISELL */
962     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
963     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
964     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
965     for (i=0; i<a->totalslices; i++) { /* loop over slices */
966       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967         a->val[j] *= l[8*i+row];
968       }
969     }
970     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
971     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
972   }
973   if (rr) {
974     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
975     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
976     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
977     for (i=0; i<a->totalslices; i++) { /* loop over slices */
978       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
979         a->val[j] *= r[a->colidx[j]];
980       }
981     }
982     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
983     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
984   }
985   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
986   PetscFunctionReturn(0);
987 }
988 
989 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
990 
991 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
992 {
993   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
994   PetscInt    *cp,i,k,low,high,t,row,col,l;
995   PetscInt    shift;
996   MatScalar   *vp;
997 
998   PetscFunctionBegin;
999   for (k=0; k<m; k++) { /* loop over requested rows */
1000     row = im[k];
1001     if (row<0) continue;
1002 #if defined(PETSC_USE_DEBUG)
1003     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1004 #endif
1005     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1006     cp = a->colidx+shift; /* pointer to the row */
1007     vp = a->val+shift; /* pointer to the row */
1008     for (l=0; l<n; l++) { /* loop over requested columns */
1009       col = in[l];
1010       if (col<0) continue;
1011 #if defined(PETSC_USE_DEBUG)
1012       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1013 #endif
1014       high = a->rlen[row]; low = 0; /* assume unsorted */
1015       while (high-low > 5) {
1016         t = (low+high)/2;
1017         if (*(cp+t*8) > col) high = t;
1018         else low = t;
1019       }
1020       for (i=low; i<high; i++) {
1021         if (*(cp+8*i) > col) break;
1022         if (*(cp+8*i) == col) {
1023           *v++ = *(vp+8*i);
1024           goto finished;
1025         }
1026       }
1027       *v++ = 0.0;
1028     finished:;
1029     }
1030   }
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1035 {
1036   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1037   PetscInt          i,j,m=A->rmap->n,shift;
1038   const char        *name;
1039   PetscViewerFormat format;
1040   PetscErrorCode    ierr;
1041 
1042   PetscFunctionBegin;
1043   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1044   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1045     PetscInt nofinalvalue = 0;
1046     /*
1047     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1048       nofinalvalue = 1;
1049     }
1050     */
1051     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1052     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1053     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1054 #if defined(PETSC_USE_COMPLEX)
1055     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1056 #else
1057     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1058 #endif
1059     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1060 
1061     for (i=0; i<m; i++) {
1062       shift = a->sliidx[i>>3]+(i&0x07);
1063       for (j=0; j<a->rlen[i]; j++) {
1064 #if defined(PETSC_USE_COMPLEX)
1065         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1066 #else
1067         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1068 #endif
1069       }
1070     }
1071     /*
1072     if (nofinalvalue) {
1073 #if defined(PETSC_USE_COMPLEX)
1074       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1075 #else
1076       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1077 #endif
1078     }
1079     */
1080     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1081     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1082     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1083   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1084     PetscFunctionReturn(0);
1085   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1086     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1087     for (i=0; i<m; i++) {
1088       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1089       shift = a->sliidx[i>>3]+(i&0x07);
1090       for (j=0; j<a->rlen[i]; j++) {
1091 #if defined(PETSC_USE_COMPLEX)
1092         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1093           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1094         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1095           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1096         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1097           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1098         }
1099 #else
1100         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1101 #endif
1102       }
1103       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1104     }
1105     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1106   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1107     PetscInt    cnt=0,jcnt;
1108     PetscScalar value;
1109 #if defined(PETSC_USE_COMPLEX)
1110     PetscBool   realonly=PETSC_TRUE;
1111     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1112       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1113         realonly = PETSC_FALSE;
1114         break;
1115       }
1116     }
1117 #endif
1118 
1119     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1120     for (i=0; i<m; i++) {
1121       jcnt = 0;
1122       shift = a->sliidx[i>>3]+(i&0x07);
1123       for (j=0; j<A->cmap->n; j++) {
1124         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1125           value = a->val[cnt++];
1126           jcnt++;
1127         } else {
1128           value = 0.0;
1129         }
1130 #if defined(PETSC_USE_COMPLEX)
1131         if (realonly) {
1132           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1133         } else {
1134           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1135         }
1136 #else
1137         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1138 #endif
1139       }
1140       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1141     }
1142     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1143   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1144     PetscInt fshift=1;
1145     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1146 #if defined(PETSC_USE_COMPLEX)
1147     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1148 #else
1149     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1150 #endif
1151     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1152     for (i=0; i<m; i++) {
1153       shift = a->sliidx[i>>3]+(i&0x07);
1154       for (j=0; j<a->rlen[i]; j++) {
1155 #if defined(PETSC_USE_COMPLEX)
1156         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1157 #else
1158         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1159 #endif
1160       }
1161     }
1162     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1163   } else if (format == PETSC_VIEWER_NATIVE) {
1164     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1165       PetscInt row;
1166       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1167       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1168 #if defined(PETSC_USE_COMPLEX)
1169         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1170           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1171         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1172           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1173         } else {
1174           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1175         }
1176 #else
1177         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1178 #endif
1179       }
1180     }
1181   } else {
1182     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1183     if (A->factortype) {
1184       for (i=0; i<m; i++) {
1185         shift = a->sliidx[i>>3]+(i&0x07);
1186         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1187         /* L part */
1188         for (j=shift; j<a->diag[i]; j+=8) {
1189 #if defined(PETSC_USE_COMPLEX)
1190           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1191             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1192           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1193             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1194           } else {
1195             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1196           }
1197 #else
1198           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1199 #endif
1200         }
1201         /* diagonal */
1202         j = a->diag[i];
1203 #if defined(PETSC_USE_COMPLEX)
1204         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1205           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1206         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1207           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1208         } else {
1209           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1210         }
1211 #else
1212         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1213 #endif
1214 
1215         /* U part */
1216         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1217 #if defined(PETSC_USE_COMPLEX)
1218           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1219             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1220           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1221             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1222           } else {
1223             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1224           }
1225 #else
1226           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1227 #endif
1228         }
1229         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1230       }
1231     } else {
1232       for (i=0; i<m; i++) {
1233         shift = a->sliidx[i>>3]+(i&0x07);
1234         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1235         for (j=0; j<a->rlen[i]; j++) {
1236 #if defined(PETSC_USE_COMPLEX)
1237           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1238             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1239           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1240             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1241           } else {
1242             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1243           }
1244 #else
1245           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1246 #endif
1247         }
1248         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1249       }
1250     }
1251     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1252   }
1253   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #include <petscdraw.h>
1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1259 {
1260   Mat               A=(Mat)Aa;
1261   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1262   PetscInt          i,j,m=A->rmap->n,shift;
1263   int               color;
1264   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1265   PetscViewer       viewer;
1266   PetscViewerFormat format;
1267   PetscErrorCode    ierr;
1268 
1269   PetscFunctionBegin;
1270   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1271   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1272   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1273 
1274   /* loop over matrix elements drawing boxes */
1275 
1276   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1277     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1278     /* Blue for negative, Cyan for zero and  Red for positive */
1279     color = PETSC_DRAW_BLUE;
1280     for (i=0; i<m; i++) {
1281       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1282       y_l = m - i - 1.0; y_r = y_l + 1.0;
1283       for (j=0; j<a->rlen[i]; j++) {
1284         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1285         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1286         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1287       }
1288     }
1289     color = PETSC_DRAW_CYAN;
1290     for (i=0; i<m; i++) {
1291       shift = a->sliidx[i>>3]+(i&0x07);
1292       y_l = m - i - 1.0; y_r = y_l + 1.0;
1293       for (j=0; j<a->rlen[i]; j++) {
1294         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1295         if (a->val[shift+8*j] !=  0.) continue;
1296         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1297       }
1298     }
1299     color = PETSC_DRAW_RED;
1300     for (i=0; i<m; i++) {
1301       shift = a->sliidx[i>>3]+(i&0x07);
1302       y_l = m - i - 1.0; y_r = y_l + 1.0;
1303       for (j=0; j<a->rlen[i]; j++) {
1304         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1305         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1306         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1307       }
1308     }
1309     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1310   } else {
1311     /* use contour shading to indicate magnitude of values */
1312     /* first determine max of all nonzero values */
1313     PetscReal minv=0.0,maxv=0.0;
1314     PetscInt  count=0;
1315     PetscDraw popup;
1316     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1317       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1318     }
1319     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1320     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1321     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1322 
1323     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1324     for (i=0; i<m; i++) {
1325       shift = a->sliidx[i>>3]+(i&0x07);
1326       y_l = m - i - 1.0;
1327       y_r = y_l + 1.0;
1328       for (j=0; j<a->rlen[i]; j++) {
1329         x_l = a->colidx[shift+j*8];
1330         x_r = x_l + 1.0;
1331         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1332         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1333         count++;
1334       }
1335     }
1336     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1337   }
1338   PetscFunctionReturn(0);
1339 }
1340 
1341 #include <petscdraw.h>
1342 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1343 {
1344   PetscDraw      draw;
1345   PetscReal      xr,yr,xl,yl,h,w;
1346   PetscBool      isnull;
1347   PetscErrorCode ierr;
1348 
1349   PetscFunctionBegin;
1350   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1351   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1352   if (isnull) PetscFunctionReturn(0);
1353 
1354   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1355   xr  += w;          yr += h;         xl = -w;     yl = -h;
1356   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1357   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1358   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1359   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1360   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1365 {
1366   PetscBool      iascii,isbinary,isdraw;
1367   PetscErrorCode ierr;
1368 
1369   PetscFunctionBegin;
1370   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1371   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1372   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1373   if (iascii) {
1374     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1375   } else if (isbinary) {
1376     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1377   } else if (isdraw) {
1378     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1379   }
1380   PetscFunctionReturn(0);
1381 }
1382 
1383 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1384 {
1385   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1386   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1387   MatScalar      *vp;
1388   PetscErrorCode ierr;
1389 
1390   PetscFunctionBegin;
1391   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1392   /* To do: compress out the unused elements */
1393   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1394   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1395   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1396   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1397   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1398   for (i=0; i<a->totalslices; ++i) {
1399     shift = a->sliidx[i];    /* starting index of the slice */
1400     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1401     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1402     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1403       row  = 8*i + row_in_slice;
1404       nrow = a->rlen[row]; /* number of nonzeros in row */
1405       /*
1406         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1407         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1408       */
1409       lastcol = 0;
1410       if (nrow>0) { /* nonempty row */
1411         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1412       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1413         for (j=1;j<8;j++) {
1414           if (a->rlen[8*i+j]) {
1415             lastcol = cp[j];
1416             break;
1417           }
1418         }
1419       } else {
1420         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1421       }
1422 
1423       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1424         cp[8*k+row_in_slice] = lastcol;
1425         vp[8*k+row_in_slice] = (MatScalar)0;
1426       }
1427     }
1428   }
1429 
1430   A->info.mallocs += a->reallocs;
1431   a->reallocs      = 0;
1432 
1433   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1434   PetscFunctionReturn(0);
1435 }
1436 
1437 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1438 {
1439   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1440 
1441   PetscFunctionBegin;
1442   info->block_size   = 1.0;
1443   info->nz_allocated = (double)a->maxallocmat;
1444   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1445   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1446   info->assemblies   = (double)A->num_ass;
1447   info->mallocs      = (double)A->info.mallocs;
1448   info->memory       = ((PetscObject)A)->mem;
1449   if (A->factortype) {
1450     info->fill_ratio_given  = A->info.fill_ratio_given;
1451     info->fill_ratio_needed = A->info.fill_ratio_needed;
1452     info->factor_mallocs    = A->info.factor_mallocs;
1453   } else {
1454     info->fill_ratio_given  = 0;
1455     info->fill_ratio_needed = 0;
1456     info->factor_mallocs    = 0;
1457   }
1458   PetscFunctionReturn(0);
1459 }
1460 
1461 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1462 {
1463   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1464   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1465   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1466   MatScalar      *vp,value;
1467   PetscErrorCode ierr;
1468 
1469   PetscFunctionBegin;
1470   for (k=0; k<m; k++) { /* loop over added rows */
1471     row = im[k];
1472     if (row < 0) continue;
1473 #if defined(PETSC_USE_DEBUG)
1474     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1475 #endif
1476     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1477     cp    = a->colidx+shift; /* pointer to the row */
1478     vp    = a->val+shift; /* pointer to the row */
1479     nrow  = a->rlen[row];
1480     low   = 0;
1481     high  = nrow;
1482 
1483     for (l=0; l<n; l++) { /* loop over added columns */
1484       col = in[l];
1485       if (col<0) continue;
1486 #if defined(PETSC_USE_DEBUG)
1487       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1488 #endif
1489       if (a->roworiented) {
1490         value = v[l+k*n];
1491       } else {
1492         value = v[k+l*m];
1493       }
1494       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1495 
1496       /* search in this row for the specified colmun, i indicates the column to be set */
1497       if (col <= lastcol) low = 0;
1498       else high = nrow;
1499       lastcol = col;
1500       while (high-low > 5) {
1501         t = (low+high)/2;
1502         if (*(cp+t*8) > col) high = t;
1503         else low = t;
1504       }
1505       for (i=low; i<high; i++) {
1506         if (*(cp+i*8) > col) break;
1507         if (*(cp+i*8) == col) {
1508           if (is == ADD_VALUES) *(vp+i*8) += value;
1509           else *(vp+i*8) = value;
1510           low = i + 1;
1511           goto noinsert;
1512         }
1513       }
1514       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1515       if (nonew == 1) goto noinsert;
1516       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1517       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1518       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1519       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1520       for (ii=nrow-1; ii>=i; ii--) {
1521         *(cp+(ii+1)*8) = *(cp+ii*8);
1522         *(vp+(ii+1)*8) = *(vp+ii*8);
1523       }
1524       a->rlen[row]++;
1525       *(cp+i*8) = col;
1526       *(vp+i*8) = value;
1527       a->nz++;
1528       A->nonzerostate++;
1529       low = i+1; high++; nrow++;
1530 noinsert:;
1531     }
1532     a->rlen[row] = nrow;
1533   }
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1538 {
1539   PetscErrorCode ierr;
1540 
1541   PetscFunctionBegin;
1542   /* If the two matrices have the same copy implementation, use fast copy. */
1543   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1544     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1545     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1546 
1547     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1548     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1549   } else {
1550     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1551   }
1552   PetscFunctionReturn(0);
1553 }
1554 
1555 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1556 {
1557   PetscErrorCode ierr;
1558 
1559   PetscFunctionBegin;
1560   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1565 {
1566   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1567 
1568   PetscFunctionBegin;
1569   *array = a->val;
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1574 {
1575   PetscFunctionBegin;
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1580 {
1581   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1582   PetscInt    i;
1583   MatScalar   *aval=a->val;
1584 
1585   PetscFunctionBegin;
1586   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1591 {
1592   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1593   PetscInt       i;
1594   MatScalar      *aval=a->val;
1595   PetscErrorCode ierr;
1596 
1597   PetscFunctionBegin;
1598   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1599   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1600   PetscFunctionReturn(0);
1601 }
1602 
1603 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1604 {
1605   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1606   MatScalar      *aval=a->val;
1607   PetscScalar    oalpha=alpha;
1608   PetscBLASInt   one=1,size;
1609   PetscErrorCode ierr;
1610 
1611   PetscFunctionBegin;
1612   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1613   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1614   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1615   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1616   PetscFunctionReturn(0);
1617 }
1618 
1619 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1620 {
1621   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1622   PetscErrorCode ierr;
1623 
1624   PetscFunctionBegin;
1625   if (!Y->preallocated || !y->nz) {
1626     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1627   }
1628   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1633 {
1634   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1635   PetscScalar       *x,sum,*t;
1636   const MatScalar   *idiag=0,*mdiag;
1637   const PetscScalar *b,*xb;
1638   PetscInt          n,m=A->rmap->n,i,j,shift;
1639   const PetscInt    *diag;
1640   PetscErrorCode    ierr;
1641 
1642   PetscFunctionBegin;
1643   its = its*lits;
1644 
1645   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1646   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1647   a->fshift = fshift;
1648   a->omega  = omega;
1649 
1650   diag  = a->diag;
1651   t     = a->ssor_work;
1652   idiag = a->idiag;
1653   mdiag = a->mdiag;
1654 
1655   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1656   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1657   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1658   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1659   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1660   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1661 
1662   if (flag & SOR_ZERO_INITIAL_GUESS) {
1663     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1664       for (i=0; i<m; i++) {
1665         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1666         sum   = b[i];
1667         n     = (diag[i]-shift)/8;
1668         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1669         t[i]  = sum;
1670         x[i]  = sum*idiag[i];
1671       }
1672       xb   = t;
1673       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1674     } else xb = b;
1675     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1676       for (i=m-1; i>=0; i--) {
1677         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1678         sum   = xb[i];
1679         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1680         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1681         if (xb == b) {
1682           x[i] = sum*idiag[i];
1683         } else {
1684           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1685         }
1686       }
1687       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1688     }
1689     its--;
1690   }
1691   while (its--) {
1692     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1693       for (i=0; i<m; i++) {
1694         /* lower */
1695         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1696         sum   = b[i];
1697         n     = (diag[i]-shift)/8;
1698         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1699         t[i]  = sum;             /* save application of the lower-triangular part */
1700         /* upper */
1701         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1702         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1703         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1704       }
1705       xb   = t;
1706       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1707     } else xb = b;
1708     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1709       for (i=m-1; i>=0; i--) {
1710         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1711         sum = xb[i];
1712         if (xb == b) {
1713           /* whole matrix (no checkpointing available) */
1714           n     = a->rlen[i];
1715           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1716           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1717         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1718           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1719           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1720           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1721         }
1722       }
1723       if (xb == b) {
1724         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1725       } else {
1726         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1727       }
1728     }
1729   }
1730   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1731   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1732   PetscFunctionReturn(0);
1733 }
1734 
1735 /* -------------------------------------------------------------------*/
1736 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1737                                        MatGetRow_SeqSELL,
1738                                        MatRestoreRow_SeqSELL,
1739                                        MatMult_SeqSELL,
1740                                /* 4*/  MatMultAdd_SeqSELL,
1741                                        MatMultTranspose_SeqSELL,
1742                                        MatMultTransposeAdd_SeqSELL,
1743                                        0,
1744                                        0,
1745                                        0,
1746                                /* 10*/ 0,
1747                                        0,
1748                                        0,
1749                                        MatSOR_SeqSELL,
1750                                        0,
1751                                /* 15*/ MatGetInfo_SeqSELL,
1752                                        MatEqual_SeqSELL,
1753                                        MatGetDiagonal_SeqSELL,
1754                                        MatDiagonalScale_SeqSELL,
1755                                        0,
1756                                /* 20*/ 0,
1757                                        MatAssemblyEnd_SeqSELL,
1758                                        MatSetOption_SeqSELL,
1759                                        MatZeroEntries_SeqSELL,
1760                                /* 24*/ 0,
1761                                        0,
1762                                        0,
1763                                        0,
1764                                        0,
1765                                /* 29*/ MatSetUp_SeqSELL,
1766                                        0,
1767                                        0,
1768                                        0,
1769                                        0,
1770                                /* 34*/ MatDuplicate_SeqSELL,
1771                                        0,
1772                                        0,
1773                                        0,
1774                                        0,
1775                                /* 39*/ 0,
1776                                        0,
1777                                        0,
1778                                        MatGetValues_SeqSELL,
1779                                        MatCopy_SeqSELL,
1780                                /* 44*/ 0,
1781                                        MatScale_SeqSELL,
1782                                        MatShift_SeqSELL,
1783                                        0,
1784                                        0,
1785                                /* 49*/ 0,
1786                                        0,
1787                                        0,
1788                                        0,
1789                                        0,
1790                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1791                                        0,
1792                                        0,
1793                                        0,
1794                                        0,
1795                                /* 59*/ 0,
1796                                        MatDestroy_SeqSELL,
1797                                        MatView_SeqSELL,
1798                                        0,
1799                                        0,
1800                                /* 64*/ 0,
1801                                        0,
1802                                        0,
1803                                        0,
1804                                        0,
1805                                /* 69*/ 0,
1806                                        0,
1807                                        0,
1808                                        0,
1809                                        0,
1810                                /* 74*/ 0,
1811                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1812                                        0,
1813                                        0,
1814                                        0,
1815                                /* 79*/ 0,
1816                                        0,
1817                                        0,
1818                                        0,
1819                                        0,
1820                                /* 84*/ 0,
1821                                        0,
1822                                        0,
1823                                        0,
1824                                        0,
1825                                /* 89*/ 0,
1826                                        0,
1827                                        0,
1828                                        0,
1829                                        0,
1830                                /* 94*/ 0,
1831                                        0,
1832                                        0,
1833                                        0,
1834                                        0,
1835                                /* 99*/ 0,
1836                                        0,
1837                                        0,
1838                                        MatConjugate_SeqSELL,
1839                                        0,
1840                                /*104*/ 0,
1841                                        0,
1842                                        0,
1843                                        0,
1844                                        0,
1845                                /*109*/ 0,
1846                                        0,
1847                                        0,
1848                                        0,
1849                                        MatMissingDiagonal_SeqSELL,
1850                                /*114*/ 0,
1851                                        0,
1852                                        0,
1853                                        0,
1854                                        0,
1855                                /*119*/ 0,
1856                                        0,
1857                                        0,
1858                                        0,
1859                                        0,
1860                                /*124*/ 0,
1861                                        0,
1862                                        0,
1863                                        0,
1864                                        0,
1865                                /*129*/ 0,
1866                                        0,
1867                                        0,
1868                                        0,
1869                                        0,
1870                                /*134*/ 0,
1871                                        0,
1872                                        0,
1873                                        0,
1874                                        0,
1875                                /*139*/ 0,
1876                                        0,
1877                                        0,
1878                                        MatFDColoringSetUp_SeqXAIJ,
1879                                        0,
1880                                 /*144*/0
1881 };
1882 
1883 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1884 {
1885   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1886   PetscErrorCode ierr;
1887 
1888   PetscFunctionBegin;
1889   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1890 
1891   /* allocate space for values if not already there */
1892   if (!a->saved_values) {
1893     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1894     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1895   }
1896 
1897   /* copy values over */
1898   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1899   PetscFunctionReturn(0);
1900 }
1901 
1902 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1903 {
1904   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1905   PetscErrorCode ierr;
1906 
1907   PetscFunctionBegin;
1908   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1909   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1910   /* copy values over */
1911   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1912   PetscFunctionReturn(0);
1913 }
1914 
1915 /*@C
1916  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1917 
1918  Not Collective
1919 
1920  Input Parameters:
1921  .  mat - a MATSEQSELL matrix
1922  .  array - pointer to the data
1923 
1924  Level: intermediate
1925 
1926  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1927  @*/
1928 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1929 {
1930   PetscErrorCode ierr;
1931 
1932   PetscFunctionBegin;
1933   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1934   PetscFunctionReturn(0);
1935 }
1936 
1937 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1938 {
1939   Mat_SeqSELL    *b;
1940   PetscMPIInt    size;
1941   PetscErrorCode ierr;
1942 
1943   PetscFunctionBegin;
1944   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1945   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1946 
1947   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1948 
1949   B->data = (void*)b;
1950 
1951   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1952 
1953   b->row                = 0;
1954   b->col                = 0;
1955   b->icol               = 0;
1956   b->reallocs           = 0;
1957   b->ignorezeroentries  = PETSC_FALSE;
1958   b->roworiented        = PETSC_TRUE;
1959   b->nonew              = 0;
1960   b->diag               = 0;
1961   b->solve_work         = 0;
1962   B->spptr              = 0;
1963   b->saved_values       = 0;
1964   b->idiag              = 0;
1965   b->mdiag              = 0;
1966   b->ssor_work          = 0;
1967   b->omega              = 1.0;
1968   b->fshift             = 0.0;
1969   b->idiagvalid         = PETSC_FALSE;
1970   b->keepnonzeropattern = PETSC_FALSE;
1971 
1972   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1973   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1974   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1977   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1978   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1979   PetscFunctionReturn(0);
1980 }
1981 
1982 /*
1983  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1984  */
1985 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1986 {
1987   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1988   PetscInt       i,m=A->rmap->n;
1989   PetscInt       totalslices=a->totalslices;
1990   PetscErrorCode ierr;
1991 
1992   PetscFunctionBegin;
1993   c = (Mat_SeqSELL*)C->data;
1994 
1995   C->factortype = A->factortype;
1996   c->row        = 0;
1997   c->col        = 0;
1998   c->icol       = 0;
1999   c->reallocs   = 0;
2000 
2001   C->assembled = PETSC_TRUE;
2002 
2003   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2004   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2005 
2006   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2007   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2008   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2009   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2010 
2011   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2012   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2013 
2014   /* allocate the matrix space */
2015   if (mallocmatspace) {
2016     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2017     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2018 
2019     c->singlemalloc = PETSC_TRUE;
2020 
2021     if (m > 0) {
2022       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2023       if (cpvalues == MAT_COPY_VALUES) {
2024         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2025       } else {
2026         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2027       }
2028     }
2029   }
2030 
2031   c->ignorezeroentries = a->ignorezeroentries;
2032   c->roworiented       = a->roworiented;
2033   c->nonew             = a->nonew;
2034   if (a->diag) {
2035     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2036     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2037     for (i=0; i<m; i++) {
2038       c->diag[i] = a->diag[i];
2039     }
2040   } else c->diag = 0;
2041 
2042   c->solve_work         = 0;
2043   c->saved_values       = 0;
2044   c->idiag              = 0;
2045   c->ssor_work          = 0;
2046   c->keepnonzeropattern = a->keepnonzeropattern;
2047   c->free_val           = PETSC_TRUE;
2048   c->free_colidx        = PETSC_TRUE;
2049 
2050   c->maxallocmat  = a->maxallocmat;
2051   c->maxallocrow  = a->maxallocrow;
2052   c->rlenmax      = a->rlenmax;
2053   c->nz           = a->nz;
2054   C->preallocated = PETSC_TRUE;
2055 
2056   c->nonzerorowcnt = a->nonzerorowcnt;
2057   C->nonzerostate  = A->nonzerostate;
2058 
2059   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2060   PetscFunctionReturn(0);
2061 }
2062 
2063 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2064 {
2065   PetscErrorCode ierr;
2066 
2067   PetscFunctionBegin;
2068   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2069   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2070   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2071     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2072   }
2073   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2074   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2075   PetscFunctionReturn(0);
2076 }
2077 
2078 /*@C
2079  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2080 
2081  Collective on MPI_Comm
2082 
2083  Input Parameters:
2084  +  comm - MPI communicator, set to PETSC_COMM_SELF
2085  .  m - number of rows
2086  .  n - number of columns
2087  .  rlenmax - maximum number of nonzeros in a row
2088  -  rlen - array containing the number of nonzeros in the various rows
2089  (possibly different for each row) or NULL
2090 
2091  Output Parameter:
2092  .  A - the matrix
2093 
2094  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2095  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2096  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2097 
2098  Notes:
2099  If nnz is given then nz is ignored
2100 
2101  Specify the preallocated storage with either rlenmax or rlen (not both).
2102  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2103  allocation.  For large problems you MUST preallocate memory or you
2104  will get TERRIBLE performance, see the users' manual chapter on matrices.
2105 
2106  Level: intermediate
2107 
2108  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2109 
2110  @*/
2111 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2112 {
2113   PetscErrorCode ierr;
2114 
2115   PetscFunctionBegin;
2116   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2117   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2118   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2119   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2120   PetscFunctionReturn(0);
2121 }
2122 
2123 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2124 {
2125   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2126   PetscInt       totalslices=a->totalslices;
2127   PetscErrorCode ierr;
2128 
2129   PetscFunctionBegin;
2130   /* If the  matrix dimensions are not equal,or no of nonzeros */
2131   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2132     *flg = PETSC_FALSE;
2133     PetscFunctionReturn(0);
2134   }
2135   /* if the a->colidx are the same */
2136   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2137   if (!*flg) PetscFunctionReturn(0);
2138   /* if a->val are the same */
2139   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2140   PetscFunctionReturn(0);
2141 }
2142 
2143 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2144 {
2145   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2146 
2147   PetscFunctionBegin;
2148   a->idiagvalid  = PETSC_FALSE;
2149   PetscFunctionReturn(0);
2150 }
2151 
2152 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2153 {
2154 #if defined(PETSC_USE_COMPLEX)
2155   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2156   PetscInt    i;
2157   PetscScalar *val = a->val;
2158 
2159   PetscFunctionBegin;
2160   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2161     val[i] = PetscConj(val[i]);
2162   }
2163 #else
2164   PetscFunctionBegin;
2165 #endif
2166   PetscFunctionReturn(0);
2167 }
2168