xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 2bcef1f22680dd67181dd710ef3d41bc6d95a252) !
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accurate as possible.
67  If it is underestimated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);}
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         ierr = PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);CHKERRQ(ierr);
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261 
262   if (reuse == MAT_REUSE_MATRIX) {
263     B = *newmat;
264   } else {
265     /* Can we just use ilen? */
266     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
267     for (i=0; i<m; i++) {
268       rowlengths[i] = ai[i+1] - ai[i];
269     }
270 
271     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
272     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
273     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
274     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
275     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
276   }
277 
278   for (row=0; row<m; row++) {
279     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
280     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
281     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
282   }
283   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval=a->val;
301   PetscInt          totalslices=a->totalslices;
302   const PetscInt    *acolidx=a->colidx;
303   PetscInt          i,j;
304   PetscErrorCode    ierr;
305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m512d           vec_x,vec_y,vec_vals;
307   __m256i           vec_idx;
308   __mmask8          mask;
309   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
310   __m256i           vec_idx2,vec_idx3,vec_idx4;
311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
312   __m128i           vec_idx;
313   __m256d           vec_x,vec_y,vec_y2,vec_vals;
314   MatScalar         yval;
315   PetscInt          r,rows_left,row,nnz_in_row;
316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
317   __m128d           vec_x_tmp;
318   __m256d           vec_x,vec_y,vec_y2,vec_vals;
319   MatScalar         yval;
320   PetscInt          r,rows_left,row,nnz_in_row;
321 #else
322   PetscScalar       sum[8];
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326 #pragma disjoint(*x,*y,*aval)
327 #endif
328 
329   PetscFunctionBegin;
330   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
331   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   for (i=0; i<totalslices; i++) { /* loop over slices */
334     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
335     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
336 
337     vec_y  = _mm512_setzero_pd();
338     vec_y2 = _mm512_setzero_pd();
339     vec_y3 = _mm512_setzero_pd();
340     vec_y4 = _mm512_setzero_pd();
341 
342     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
343     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
344     case 3:
345       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
346       acolidx += 8; aval += 8;
347       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
348       acolidx += 8; aval += 8;
349       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
350       acolidx += 8; aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
355       acolidx += 8; aval += 8;
356       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
357       acolidx += 8; aval += 8;
358       j += 2;
359       break;
360     case 1:
361       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
362       acolidx += 8; aval += 8;
363       j += 1;
364       break;
365     }
366     #pragma novector
367     for (; j<(a->sliidx[i+1]>>3); j+=4) {
368       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
369       acolidx += 8; aval += 8;
370       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
371       acolidx += 8; aval += 8;
372       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
375       acolidx += 8; aval += 8;
376     }
377 
378     vec_y = _mm512_add_pd(vec_y,vec_y2);
379     vec_y = _mm512_add_pd(vec_y,vec_y3);
380     vec_y = _mm512_add_pd(vec_y,vec_y4);
381     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
383       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
384     } else {
385       _mm512_storeu_pd(&y[8*i],vec_y);
386     }
387   }
388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389   for (i=0; i<totalslices; i++) { /* loop over full slices */
390     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
391     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
392 
393     /* last slice may have padding rows. Don't use vectorization. */
394     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
395       rows_left = A->rmap->n - 8*i;
396       for (r=0; r<rows_left; ++r) {
397         yval = (MatScalar)0;
398         row = 8*i + r;
399         nnz_in_row = a->rlen[row];
400         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
401         y[row] = yval;
402       }
403       break;
404     }
405 
406     vec_y  = _mm256_setzero_pd();
407     vec_y2 = _mm256_setzero_pd();
408 
409     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410     #pragma novector
411     #pragma unroll(2)
412     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
413       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
414       aval += 4; acolidx += 4;
415       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
416       aval += 4; acolidx += 4;
417     }
418 
419     _mm256_storeu_pd(y+i*8,vec_y);
420     _mm256_storeu_pd(y+i*8+4,vec_y2);
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   for (i=0; i<totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
426 
427     vec_y  = _mm256_setzero_pd();
428     vec_y2 = _mm256_setzero_pd();
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8*i;
433       for (r=0; r<rows_left; ++r) {
434         yval = (MatScalar)0;
435         row = 8*i + r;
436         nnz_in_row = a->rlen[row];
437         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444     #pragma novector
445     #pragma unroll(2)
446     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
447       vec_vals  = _mm256_loadu_pd(aval);
448       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
449       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
450       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
454       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
455       aval     += 4;
456 
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
465       aval     += 4;
466     }
467 
468     _mm256_storeu_pd(y + i*8,     vec_y);
469     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
470   }
471 #else
472   for (i=0; i<totalslices; i++) { /* loop over slices */
473     for (j=0; j<8; j++) sum[j] = 0.0;
474     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
475       sum[0] += aval[j] * x[acolidx[j]];
476       sum[1] += aval[j+1] * x[acolidx[j+1]];
477       sum[2] += aval[j+2] * x[acolidx[j+2]];
478       sum[3] += aval[j+3] * x[acolidx[j+3]];
479       sum[4] += aval[j+4] * x[acolidx[j+4]];
480       sum[5] += aval[j+5] * x[acolidx[j+5]];
481       sum[6] += aval[j+6] * x[acolidx[j+6]];
482       sum[7] += aval[j+7] * x[acolidx[j+7]];
483     }
484     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
485       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
486     } else {
487       for(j=0; j<8; j++) y[8*i+j] = sum[j];
488     }
489   }
490 #endif
491 
492   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
500 {
501   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
502   PetscScalar       *y,*z;
503   const PetscScalar *x;
504   const MatScalar   *aval=a->val;
505   PetscInt          totalslices=a->totalslices;
506   const PetscInt    *acolidx=a->colidx;
507   PetscInt          i,j;
508   PetscErrorCode    ierr;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d           vec_x,vec_y,vec_vals;
511   __m256i           vec_idx;
512   __mmask8          mask;
513   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
514   __m256i           vec_idx2,vec_idx3,vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d           vec_x_tmp;
517   __m256d           vec_x,vec_y,vec_y2,vec_vals;
518   MatScalar         yval;
519   PetscInt          r,row,nnz_in_row;
520 #else
521   PetscScalar       sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x,*y,*aval)
526 #endif
527 
528   PetscFunctionBegin;
529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i=0; i<totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
538       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
539     } else {
540       vec_y  = _mm512_loadu_pd(&y[8*i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
550       acolidx += 8; aval += 8;
551       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
552       acolidx += 8; aval += 8;
553       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
554       acolidx += 8; aval += 8;
555       j += 3;
556       break;
557     case 2:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       j += 2;
563       break;
564     case 1:
565       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
566       acolidx += 8; aval += 8;
567       j += 1;
568       break;
569     }
570     #pragma novector
571     for (; j<(a->sliidx[i+1]>>3); j+=4) {
572       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
573       acolidx += 8; aval += 8;
574       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
575       acolidx += 8; aval += 8;
576       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
579       acolidx += 8; aval += 8;
580     }
581 
582     vec_y = _mm512_add_pd(vec_y,vec_y2);
583     vec_y = _mm512_add_pd(vec_y,vec_y3);
584     vec_y = _mm512_add_pd(vec_y,vec_y4);
585     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
586       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
587     } else {
588       _mm512_storeu_pd(&z[8*i],vec_y);
589     }
590   }
591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
592   for (i=0; i<totalslices; i++) { /* loop over full slices */
593     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
594     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595 
596     /* last slice may have padding rows. Don't use vectorization. */
597     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
598       for (r=0; r<(A->rmap->n & 0x07); ++r) {
599         row        = 8*i + r;
600         yval       = (MatScalar)0.0;
601         nnz_in_row = a->rlen[row];
602         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
603         z[row] = y[row] + yval;
604       }
605       break;
606     }
607 
608     vec_y  = _mm256_loadu_pd(y+8*i);
609     vec_y2 = _mm256_loadu_pd(y+8*i+4);
610 
611     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
612     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
613       vec_vals  = _mm256_loadu_pd(aval);
614       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
615       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
616       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
620       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
621       aval     += 4;
622 
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
625       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
631       aval     += 4;
632     }
633 
634     _mm256_storeu_pd(z+i*8,vec_y);
635     _mm256_storeu_pd(z+i*8+4,vec_y2);
636   }
637 #else
638   for (i=0; i<totalslices; i++) { /* loop over slices */
639     for (j=0; j<8; j++) sum[j] = 0.0;
640     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
641       sum[0] += aval[j] * x[acolidx[j]];
642       sum[1] += aval[j+1] * x[acolidx[j+1]];
643       sum[2] += aval[j+2] * x[acolidx[j+2]];
644       sum[3] += aval[j+3] * x[acolidx[j+3]];
645       sum[4] += aval[j+4] * x[acolidx[j+4]];
646       sum[5] += aval[j+5] * x[acolidx[j+5]];
647       sum[6] += aval[j+6] * x[acolidx[j+6]];
648       sum[7] += aval[j+7] * x[acolidx[j+7]];
649     }
650     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
651       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
652     } else {
653       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
654     }
655   }
656 #endif
657 
658   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
665 {
666   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
667   PetscScalar       *y;
668   const PetscScalar *x;
669   const MatScalar   *aval=a->val;
670   const PetscInt    *acolidx=a->colidx;
671   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
672   PetscErrorCode    ierr;
673 
674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675 #pragma disjoint(*x,*y,*aval)
676 #endif
677 
678   PetscFunctionBegin;
679   if (A->symmetric) {
680     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
681     PetscFunctionReturn(0);
682   }
683   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
686   for (i=0; i<a->totalslices; i++) { /* loop over slices */
687     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
688       for (r=0; r<(A->rmap->n & 0x07); ++r) {
689         row        = 8*i + r;
690         nnz_in_row = a->rlen[row];
691         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
692       }
693       break;
694     }
695     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
696       y[acolidx[j]]   += aval[j] * x[8*i];
697       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
698       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
699       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
700       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
701       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
702       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
703       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
704     }
705   }
706   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   if (A->symmetric) {
718     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
719   } else {
720     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
721     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
722   }
723   PetscFunctionReturn(0);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
730 {
731   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
732   PetscInt       *diag,i;
733   PetscErrorCode ierr;
734 
735   PetscFunctionBegin;
736   *missing = PETSC_FALSE;
737   if (A->rmap->n > 0 && !(a->colidx)) {
738     *missing = PETSC_TRUE;
739     if (d) *d = 0;
740     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
741   } else {
742     diag = a->diag;
743     for (i=0; i<A->rmap->n; i++) {
744       if (diag[i] == -1) {
745         *missing = PETSC_TRUE;
746         if (d) *d = i;
747         ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr);
748         break;
749       }
750     }
751   }
752   PetscFunctionReturn(0);
753 }
754 
755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
756 {
757   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
758   PetscInt       i,j,m=A->rmap->n,shift;
759   PetscErrorCode ierr;
760 
761   PetscFunctionBegin;
762   if (!a->diag) {
763     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
764     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
765     a->free_diag = PETSC_TRUE;
766   }
767   for (i=0; i<m; i++) { /* loop over rows */
768     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
769     a->diag[i] = -1;
770     for (j=0; j<a->rlen[i]; j++) {
771       if (a->colidx[shift+j*8] == i) {
772         a->diag[i] = shift+j*8;
773         break;
774       }
775     }
776   }
777   PetscFunctionReturn(0);
778 }
779 
780 /*
781   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
782 */
783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
784 {
785   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
786   PetscInt       i,*diag,m = A->rmap->n;
787   MatScalar      *val = a->val;
788   PetscScalar    *idiag,*mdiag;
789   PetscErrorCode ierr;
790 
791   PetscFunctionBegin;
792   if (a->idiagvalid) PetscFunctionReturn(0);
793   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
794   diag = a->diag;
795   if (!a->idiag) {
796     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
797     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
798     val  = a->val;
799   }
800   mdiag = a->mdiag;
801   idiag = a->idiag;
802 
803   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804     for (i=0; i<m; i++) {
805       mdiag[i] = val[diag[i]];
806       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807         if (PetscRealPart(fshift)) {
808           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
809           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810           A->factorerror_zeropivot_value = 0.0;
811           A->factorerror_zeropivot_row   = i;
812         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
813       }
814       idiag[i] = 1.0/val[diag[i]];
815     }
816     ierr = PetscLogFlops(m);CHKERRQ(ierr);
817   } else {
818     for (i=0; i<m; i++) {
819       mdiag[i] = val[diag[i]];
820       idiag[i] = omega/(fshift + val[diag[i]]);
821     }
822     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
823   }
824   a->idiagvalid = PETSC_TRUE;
825   PetscFunctionReturn(0);
826 }
827 
828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829 {
830   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
835   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode MatDestroy_SeqSELL(Mat A)
840 {
841   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
842   PetscErrorCode ierr;
843 
844   PetscFunctionBegin;
845 #if defined(PETSC_USE_LOG)
846   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
847 #endif
848   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
849   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
850   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
851   ierr = PetscFree(a->diag);CHKERRQ(ierr);
852   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
853   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
854   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
855   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
856   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
857   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
858   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
859 
860   ierr = PetscFree(A->data);CHKERRQ(ierr);
861 
862   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
863   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
864   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
865 #if defined(PETSC_HAVE_ELEMENTAL)
866 #endif
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
872 {
873   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
874   PetscErrorCode ierr;
875 
876   PetscFunctionBegin;
877   switch (op) {
878   case MAT_ROW_ORIENTED:
879     a->roworiented = flg;
880     break;
881   case MAT_KEEP_NONZERO_PATTERN:
882     a->keepnonzeropattern = flg;
883     break;
884   case MAT_NEW_NONZERO_LOCATIONS:
885     a->nonew = (flg ? 0 : 1);
886     break;
887   case MAT_NEW_NONZERO_LOCATION_ERR:
888     a->nonew = (flg ? -1 : 0);
889     break;
890   case MAT_NEW_NONZERO_ALLOCATION_ERR:
891     a->nonew = (flg ? -2 : 0);
892     break;
893   case MAT_UNUSED_NONZERO_LOCATION_ERR:
894     a->nounused = (flg ? -1 : 0);
895     break;
896   case MAT_NEW_DIAGONALS:
897   case MAT_IGNORE_OFF_PROC_ENTRIES:
898   case MAT_USE_HASH_TABLE:
899   case MAT_SORTED_FULL:
900     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
901     break;
902   case MAT_SPD:
903   case MAT_SYMMETRIC:
904   case MAT_STRUCTURALLY_SYMMETRIC:
905   case MAT_HERMITIAN:
906   case MAT_SYMMETRY_ETERNAL:
907     /* These options are handled directly by MatSetOption() */
908     break;
909   default:
910     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
916 {
917   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
918   PetscInt       i,j,n,shift;
919   PetscScalar    *x,zero=0.0;
920   PetscErrorCode ierr;
921 
922   PetscFunctionBegin;
923   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
924   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
925 
926   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
927     PetscInt *diag=a->diag;
928     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
929     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
930     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
931     PetscFunctionReturn(0);
932   }
933 
934   ierr = VecSet(v,zero);CHKERRQ(ierr);
935   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
936   for (i=0; i<n; i++) { /* loop over rows */
937     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
938     x[i] = 0;
939     for (j=0; j<a->rlen[i]; j++) {
940       if (a->colidx[shift+j*8] == i) {
941         x[i] = a->val[shift+j*8];
942         break;
943       }
944     }
945   }
946   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
947   PetscFunctionReturn(0);
948 }
949 
950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
951 {
952   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
953   const PetscScalar *l,*r;
954   PetscInt          i,j,m,n,row;
955   PetscErrorCode    ierr;
956 
957   PetscFunctionBegin;
958   if (ll) {
959     /* The local size is used so that VecMPI can be passed to this routine
960        by MatDiagonalScale_MPISELL */
961     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
962     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
963     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
964     for (i=0; i<a->totalslices; i++) { /* loop over slices */
965       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
966         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
968         }
969       } else {
970         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
971           a->val[j] *= l[8*i+row];
972         }
973       }
974     }
975     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
976     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
977   }
978   if (rr) {
979     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
980     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
981     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
982     for (i=0; i<a->totalslices; i++) { /* loop over slices */
983       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
984         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
985           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
986         }
987       } else {
988         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
989           a->val[j] *= r[a->colidx[j]];
990         }
991       }
992     }
993     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
994     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
995   }
996   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
997   PetscFunctionReturn(0);
998 }
999 
1000 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
1001 
1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1003 {
1004   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1005   PetscInt    *cp,i,k,low,high,t,row,col,l;
1006   PetscInt    shift;
1007   MatScalar   *vp;
1008 
1009   PetscFunctionBegin;
1010   for (k=0; k<m; k++) { /* loop over requested rows */
1011     row = im[k];
1012     if (row<0) continue;
1013     if (PetscUnlikelyDebug(row >= A->rmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1014     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1015     cp = a->colidx+shift; /* pointer to the row */
1016     vp = a->val+shift; /* pointer to the row */
1017     for (l=0; l<n; l++) { /* loop over requested columns */
1018       col = in[l];
1019       if (col<0) continue;
1020       if (PetscUnlikelyDebug(col >= A->cmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1021       high = a->rlen[row]; low = 0; /* assume unsorted */
1022       while (high-low > 5) {
1023         t = (low+high)/2;
1024         if (*(cp+t*8) > col) high = t;
1025         else low = t;
1026       }
1027       for (i=low; i<high; i++) {
1028         if (*(cp+8*i) > col) break;
1029         if (*(cp+8*i) == col) {
1030           *v++ = *(vp+8*i);
1031           goto finished;
1032         }
1033       }
1034       *v++ = 0.0;
1035     finished:;
1036     }
1037   }
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1042 {
1043   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1044   PetscInt          i,j,m=A->rmap->n,shift;
1045   const char        *name;
1046   PetscViewerFormat format;
1047   PetscErrorCode    ierr;
1048 
1049   PetscFunctionBegin;
1050   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1051   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1052     PetscInt nofinalvalue = 0;
1053     /*
1054     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1055       nofinalvalue = 1;
1056     }
1057     */
1058     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1059     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1060     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1061 #if defined(PETSC_USE_COMPLEX)
1062     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1063 #else
1064     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1065 #endif
1066     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1067 
1068     for (i=0; i<m; i++) {
1069       shift = a->sliidx[i>>3]+(i&0x07);
1070       for (j=0; j<a->rlen[i]; j++) {
1071 #if defined(PETSC_USE_COMPLEX)
1072         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1073 #else
1074         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1075 #endif
1076       }
1077     }
1078     /*
1079     if (nofinalvalue) {
1080 #if defined(PETSC_USE_COMPLEX)
1081       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1082 #else
1083       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1084 #endif
1085     }
1086     */
1087     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1088     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1089     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1090   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1091     PetscFunctionReturn(0);
1092   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1093     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1094     for (i=0; i<m; i++) {
1095       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1096       shift = a->sliidx[i>>3]+(i&0x07);
1097       for (j=0; j<a->rlen[i]; j++) {
1098 #if defined(PETSC_USE_COMPLEX)
1099         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1100           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1101         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1102           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1103         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1104           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1105         }
1106 #else
1107         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1108 #endif
1109       }
1110       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1111     }
1112     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1113   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1114     PetscInt    cnt=0,jcnt;
1115     PetscScalar value;
1116 #if defined(PETSC_USE_COMPLEX)
1117     PetscBool   realonly=PETSC_TRUE;
1118     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1119       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1120         realonly = PETSC_FALSE;
1121         break;
1122       }
1123     }
1124 #endif
1125 
1126     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1127     for (i=0; i<m; i++) {
1128       jcnt = 0;
1129       shift = a->sliidx[i>>3]+(i&0x07);
1130       for (j=0; j<A->cmap->n; j++) {
1131         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1132           value = a->val[cnt++];
1133           jcnt++;
1134         } else {
1135           value = 0.0;
1136         }
1137 #if defined(PETSC_USE_COMPLEX)
1138         if (realonly) {
1139           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1140         } else {
1141           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1142         }
1143 #else
1144         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1145 #endif
1146       }
1147       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1148     }
1149     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1150   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1151     PetscInt fshift=1;
1152     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1153 #if defined(PETSC_USE_COMPLEX)
1154     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1155 #else
1156     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1157 #endif
1158     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1159     for (i=0; i<m; i++) {
1160       shift = a->sliidx[i>>3]+(i&0x07);
1161       for (j=0; j<a->rlen[i]; j++) {
1162 #if defined(PETSC_USE_COMPLEX)
1163         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1164 #else
1165         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1166 #endif
1167       }
1168     }
1169     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1170   } else if (format == PETSC_VIEWER_NATIVE) {
1171     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1172       PetscInt row;
1173       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1174       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1175 #if defined(PETSC_USE_COMPLEX)
1176         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1177           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1178         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1179           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1180         } else {
1181           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1182         }
1183 #else
1184         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1185 #endif
1186       }
1187     }
1188   } else {
1189     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1190     if (A->factortype) {
1191       for (i=0; i<m; i++) {
1192         shift = a->sliidx[i>>3]+(i&0x07);
1193         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1194         /* L part */
1195         for (j=shift; j<a->diag[i]; j+=8) {
1196 #if defined(PETSC_USE_COMPLEX)
1197           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1198             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1199           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1200             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1201           } else {
1202             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1203           }
1204 #else
1205           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1206 #endif
1207         }
1208         /* diagonal */
1209         j = a->diag[i];
1210 #if defined(PETSC_USE_COMPLEX)
1211         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1212           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1213         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1214           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1215         } else {
1216           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1217         }
1218 #else
1219         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1220 #endif
1221 
1222         /* U part */
1223         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1224 #if defined(PETSC_USE_COMPLEX)
1225           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1226             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1227           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1228             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1229           } else {
1230             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1231           }
1232 #else
1233           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1234 #endif
1235         }
1236         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1237       }
1238     } else {
1239       for (i=0; i<m; i++) {
1240         shift = a->sliidx[i>>3]+(i&0x07);
1241         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1242         for (j=0; j<a->rlen[i]; j++) {
1243 #if defined(PETSC_USE_COMPLEX)
1244           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1245             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1246           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1247             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1248           } else {
1249             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1250           }
1251 #else
1252           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1253 #endif
1254         }
1255         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1256       }
1257     }
1258     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1259   }
1260   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1261   PetscFunctionReturn(0);
1262 }
1263 
1264 #include <petscdraw.h>
1265 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1266 {
1267   Mat               A=(Mat)Aa;
1268   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1269   PetscInt          i,j,m=A->rmap->n,shift;
1270   int               color;
1271   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1272   PetscViewer       viewer;
1273   PetscViewerFormat format;
1274   PetscErrorCode    ierr;
1275 
1276   PetscFunctionBegin;
1277   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1278   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1279   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1280 
1281   /* loop over matrix elements drawing boxes */
1282 
1283   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1284     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1285     /* Blue for negative, Cyan for zero and  Red for positive */
1286     color = PETSC_DRAW_BLUE;
1287     for (i=0; i<m; i++) {
1288       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1289       y_l = m - i - 1.0; y_r = y_l + 1.0;
1290       for (j=0; j<a->rlen[i]; j++) {
1291         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1292         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1293         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1294       }
1295     }
1296     color = PETSC_DRAW_CYAN;
1297     for (i=0; i<m; i++) {
1298       shift = a->sliidx[i>>3]+(i&0x07);
1299       y_l = m - i - 1.0; y_r = y_l + 1.0;
1300       for (j=0; j<a->rlen[i]; j++) {
1301         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1302         if (a->val[shift+8*j] !=  0.) continue;
1303         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1304       }
1305     }
1306     color = PETSC_DRAW_RED;
1307     for (i=0; i<m; i++) {
1308       shift = a->sliidx[i>>3]+(i&0x07);
1309       y_l = m - i - 1.0; y_r = y_l + 1.0;
1310       for (j=0; j<a->rlen[i]; j++) {
1311         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1312         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1313         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1314       }
1315     }
1316     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1317   } else {
1318     /* use contour shading to indicate magnitude of values */
1319     /* first determine max of all nonzero values */
1320     PetscReal minv=0.0,maxv=0.0;
1321     PetscInt  count=0;
1322     PetscDraw popup;
1323     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1324       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1325     }
1326     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1327     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1328     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1329 
1330     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1331     for (i=0; i<m; i++) {
1332       shift = a->sliidx[i>>3]+(i&0x07);
1333       y_l = m - i - 1.0;
1334       y_r = y_l + 1.0;
1335       for (j=0; j<a->rlen[i]; j++) {
1336         x_l = a->colidx[shift+j*8];
1337         x_r = x_l + 1.0;
1338         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1339         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1340         count++;
1341       }
1342     }
1343     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1344   }
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 #include <petscdraw.h>
1349 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1350 {
1351   PetscDraw      draw;
1352   PetscReal      xr,yr,xl,yl,h,w;
1353   PetscBool      isnull;
1354   PetscErrorCode ierr;
1355 
1356   PetscFunctionBegin;
1357   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1358   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1359   if (isnull) PetscFunctionReturn(0);
1360 
1361   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1362   xr  += w;          yr += h;         xl = -w;     yl = -h;
1363   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1364   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1365   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1366   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1367   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1372 {
1373   PetscBool      iascii,isbinary,isdraw;
1374   PetscErrorCode ierr;
1375 
1376   PetscFunctionBegin;
1377   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1378   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1379   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1380   if (iascii) {
1381     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1382   } else if (isbinary) {
1383     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1384   } else if (isdraw) {
1385     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1386   }
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1391 {
1392   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1393   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1394   MatScalar      *vp;
1395   PetscErrorCode ierr;
1396 
1397   PetscFunctionBegin;
1398   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1399   /* To do: compress out the unused elements */
1400   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1401   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1402   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1403   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1404   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1405   for (i=0; i<a->totalslices; ++i) {
1406     shift = a->sliidx[i];    /* starting index of the slice */
1407     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1408     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1409     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1410       row  = 8*i + row_in_slice;
1411       nrow = a->rlen[row]; /* number of nonzeros in row */
1412       /*
1413         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1414         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1415       */
1416       lastcol = 0;
1417       if (nrow>0) { /* nonempty row */
1418         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1419       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1420         for (j=1;j<8;j++) {
1421           if (a->rlen[8*i+j]) {
1422             lastcol = cp[j];
1423             break;
1424           }
1425         }
1426       } else {
1427         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1428       }
1429 
1430       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1431         cp[8*k+row_in_slice] = lastcol;
1432         vp[8*k+row_in_slice] = (MatScalar)0;
1433       }
1434     }
1435   }
1436 
1437   A->info.mallocs += a->reallocs;
1438   a->reallocs      = 0;
1439 
1440   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1445 {
1446   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1447 
1448   PetscFunctionBegin;
1449   info->block_size   = 1.0;
1450   info->nz_allocated = a->maxallocmat;
1451   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1452   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1453   info->assemblies   = A->num_ass;
1454   info->mallocs      = A->info.mallocs;
1455   info->memory       = ((PetscObject)A)->mem;
1456   if (A->factortype) {
1457     info->fill_ratio_given  = A->info.fill_ratio_given;
1458     info->fill_ratio_needed = A->info.fill_ratio_needed;
1459     info->factor_mallocs    = A->info.factor_mallocs;
1460   } else {
1461     info->fill_ratio_given  = 0;
1462     info->fill_ratio_needed = 0;
1463     info->factor_mallocs    = 0;
1464   }
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1469 {
1470   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1471   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1472   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1473   MatScalar      *vp,value;
1474   PetscErrorCode ierr;
1475 
1476   PetscFunctionBegin;
1477   for (k=0; k<m; k++) { /* loop over added rows */
1478     row = im[k];
1479     if (row < 0) continue;
1480     if (PetscUnlikelyDebug(row >= A->rmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1481     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1482     cp    = a->colidx+shift; /* pointer to the row */
1483     vp    = a->val+shift; /* pointer to the row */
1484     nrow  = a->rlen[row];
1485     low   = 0;
1486     high  = nrow;
1487 
1488     for (l=0; l<n; l++) { /* loop over added columns */
1489       col = in[l];
1490       if (col<0) continue;
1491       if (PetscUnlikelyDebug(col >= A->cmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1492       if (a->roworiented) {
1493         value = v[l+k*n];
1494       } else {
1495         value = v[k+l*m];
1496       }
1497       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1498 
1499       /* search in this row for the specified colmun, i indicates the column to be set */
1500       if (col <= lastcol) low = 0;
1501       else high = nrow;
1502       lastcol = col;
1503       while (high-low > 5) {
1504         t = (low+high)/2;
1505         if (*(cp+t*8) > col) high = t;
1506         else low = t;
1507       }
1508       for (i=low; i<high; i++) {
1509         if (*(cp+i*8) > col) break;
1510         if (*(cp+i*8) == col) {
1511           if (is == ADD_VALUES) *(vp+i*8) += value;
1512           else *(vp+i*8) = value;
1513           low = i + 1;
1514           goto noinsert;
1515         }
1516       }
1517       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1518       if (nonew == 1) goto noinsert;
1519       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1520       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1521       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1522       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1523       for (ii=nrow-1; ii>=i; ii--) {
1524         *(cp+(ii+1)*8) = *(cp+ii*8);
1525         *(vp+(ii+1)*8) = *(vp+ii*8);
1526       }
1527       a->rlen[row]++;
1528       *(cp+i*8) = col;
1529       *(vp+i*8) = value;
1530       a->nz++;
1531       A->nonzerostate++;
1532       low = i+1; high++; nrow++;
1533 noinsert:;
1534     }
1535     a->rlen[row] = nrow;
1536   }
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1541 {
1542   PetscErrorCode ierr;
1543 
1544   PetscFunctionBegin;
1545   /* If the two matrices have the same copy implementation, use fast copy. */
1546   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1547     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1548     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1549 
1550     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1551     ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1552   } else {
1553     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1554   }
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1559 {
1560   PetscErrorCode ierr;
1561 
1562   PetscFunctionBegin;
1563   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1568 {
1569   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1570 
1571   PetscFunctionBegin;
1572   *array = a->val;
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1577 {
1578   PetscFunctionBegin;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1583 {
1584   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1585   PetscInt    i;
1586   MatScalar   *aval=a->val;
1587 
1588   PetscFunctionBegin;
1589   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1594 {
1595   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1596   PetscInt       i;
1597   MatScalar      *aval=a->val;
1598   PetscErrorCode ierr;
1599 
1600   PetscFunctionBegin;
1601   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1602   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1607 {
1608   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1609   MatScalar      *aval=a->val;
1610   PetscScalar    oalpha=alpha;
1611   PetscBLASInt   one=1,size;
1612   PetscErrorCode ierr;
1613 
1614   PetscFunctionBegin;
1615   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1616   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1617   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1618   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1623 {
1624   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1625   PetscErrorCode ierr;
1626 
1627   PetscFunctionBegin;
1628   if (!Y->preallocated || !y->nz) {
1629     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1630   }
1631   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1632   PetscFunctionReturn(0);
1633 }
1634 
1635 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1636 {
1637   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1638   PetscScalar       *x,sum,*t;
1639   const MatScalar   *idiag=0,*mdiag;
1640   const PetscScalar *b,*xb;
1641   PetscInt          n,m=A->rmap->n,i,j,shift;
1642   const PetscInt    *diag;
1643   PetscErrorCode    ierr;
1644 
1645   PetscFunctionBegin;
1646   its = its*lits;
1647 
1648   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1649   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1650   a->fshift = fshift;
1651   a->omega  = omega;
1652 
1653   diag  = a->diag;
1654   t     = a->ssor_work;
1655   idiag = a->idiag;
1656   mdiag = a->mdiag;
1657 
1658   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1659   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1660   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1661   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1662   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1663   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1664 
1665   if (flag & SOR_ZERO_INITIAL_GUESS) {
1666     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1667       for (i=0; i<m; i++) {
1668         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1669         sum   = b[i];
1670         n     = (diag[i]-shift)/8;
1671         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1672         t[i]  = sum;
1673         x[i]  = sum*idiag[i];
1674       }
1675       xb   = t;
1676       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1677     } else xb = b;
1678     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1679       for (i=m-1; i>=0; i--) {
1680         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1681         sum   = xb[i];
1682         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1683         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1684         if (xb == b) {
1685           x[i] = sum*idiag[i];
1686         } else {
1687           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1688         }
1689       }
1690       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1691     }
1692     its--;
1693   }
1694   while (its--) {
1695     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1696       for (i=0; i<m; i++) {
1697         /* lower */
1698         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1699         sum   = b[i];
1700         n     = (diag[i]-shift)/8;
1701         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1702         t[i]  = sum;             /* save application of the lower-triangular part */
1703         /* upper */
1704         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1705         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1706         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1707       }
1708       xb   = t;
1709       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1710     } else xb = b;
1711     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1712       for (i=m-1; i>=0; i--) {
1713         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1714         sum = xb[i];
1715         if (xb == b) {
1716           /* whole matrix (no checkpointing available) */
1717           n     = a->rlen[i];
1718           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1719           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1720         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1721           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1722           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1723           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1724         }
1725       }
1726       if (xb == b) {
1727         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1728       } else {
1729         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1730       }
1731     }
1732   }
1733   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1734   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 /* -------------------------------------------------------------------*/
1739 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1740                                        MatGetRow_SeqSELL,
1741                                        MatRestoreRow_SeqSELL,
1742                                        MatMult_SeqSELL,
1743                                /* 4*/  MatMultAdd_SeqSELL,
1744                                        MatMultTranspose_SeqSELL,
1745                                        MatMultTransposeAdd_SeqSELL,
1746                                        0,
1747                                        0,
1748                                        0,
1749                                /* 10*/ 0,
1750                                        0,
1751                                        0,
1752                                        MatSOR_SeqSELL,
1753                                        0,
1754                                /* 15*/ MatGetInfo_SeqSELL,
1755                                        MatEqual_SeqSELL,
1756                                        MatGetDiagonal_SeqSELL,
1757                                        MatDiagonalScale_SeqSELL,
1758                                        0,
1759                                /* 20*/ 0,
1760                                        MatAssemblyEnd_SeqSELL,
1761                                        MatSetOption_SeqSELL,
1762                                        MatZeroEntries_SeqSELL,
1763                                /* 24*/ 0,
1764                                        0,
1765                                        0,
1766                                        0,
1767                                        0,
1768                                /* 29*/ MatSetUp_SeqSELL,
1769                                        0,
1770                                        0,
1771                                        0,
1772                                        0,
1773                                /* 34*/ MatDuplicate_SeqSELL,
1774                                        0,
1775                                        0,
1776                                        0,
1777                                        0,
1778                                /* 39*/ 0,
1779                                        0,
1780                                        0,
1781                                        MatGetValues_SeqSELL,
1782                                        MatCopy_SeqSELL,
1783                                /* 44*/ 0,
1784                                        MatScale_SeqSELL,
1785                                        MatShift_SeqSELL,
1786                                        0,
1787                                        0,
1788                                /* 49*/ 0,
1789                                        0,
1790                                        0,
1791                                        0,
1792                                        0,
1793                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1794                                        0,
1795                                        0,
1796                                        0,
1797                                        0,
1798                                /* 59*/ 0,
1799                                        MatDestroy_SeqSELL,
1800                                        MatView_SeqSELL,
1801                                        0,
1802                                        0,
1803                                /* 64*/ 0,
1804                                        0,
1805                                        0,
1806                                        0,
1807                                        0,
1808                                /* 69*/ 0,
1809                                        0,
1810                                        0,
1811                                        0,
1812                                        0,
1813                                /* 74*/ 0,
1814                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1815                                        0,
1816                                        0,
1817                                        0,
1818                                /* 79*/ 0,
1819                                        0,
1820                                        0,
1821                                        0,
1822                                        0,
1823                                /* 84*/ 0,
1824                                        0,
1825                                        0,
1826                                        0,
1827                                        0,
1828                                /* 89*/ 0,
1829                                        0,
1830                                        0,
1831                                        0,
1832                                        0,
1833                                /* 94*/ 0,
1834                                        0,
1835                                        0,
1836                                        0,
1837                                        0,
1838                                /* 99*/ 0,
1839                                        0,
1840                                        0,
1841                                        MatConjugate_SeqSELL,
1842                                        0,
1843                                /*104*/ 0,
1844                                        0,
1845                                        0,
1846                                        0,
1847                                        0,
1848                                /*109*/ 0,
1849                                        0,
1850                                        0,
1851                                        0,
1852                                        MatMissingDiagonal_SeqSELL,
1853                                /*114*/ 0,
1854                                        0,
1855                                        0,
1856                                        0,
1857                                        0,
1858                                /*119*/ 0,
1859                                        0,
1860                                        0,
1861                                        0,
1862                                        0,
1863                                /*124*/ 0,
1864                                        0,
1865                                        0,
1866                                        0,
1867                                        0,
1868                                /*129*/ 0,
1869                                        0,
1870                                        0,
1871                                        0,
1872                                        0,
1873                                /*134*/ 0,
1874                                        0,
1875                                        0,
1876                                        0,
1877                                        0,
1878                                /*139*/ 0,
1879                                        0,
1880                                        0,
1881                                        MatFDColoringSetUp_SeqXAIJ,
1882                                        0,
1883                                 /*144*/0
1884 };
1885 
1886 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1887 {
1888   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1889   PetscErrorCode ierr;
1890 
1891   PetscFunctionBegin;
1892   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1893 
1894   /* allocate space for values if not already there */
1895   if (!a->saved_values) {
1896     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1897     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1898   }
1899 
1900   /* copy values over */
1901   ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1902   PetscFunctionReturn(0);
1903 }
1904 
1905 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1906 {
1907   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1908   PetscErrorCode ierr;
1909 
1910   PetscFunctionBegin;
1911   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1912   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1913   ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1914   PetscFunctionReturn(0);
1915 }
1916 
1917 /*@C
1918  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1919 
1920  Not Collective
1921 
1922  Input Parameters:
1923  .  mat - a MATSEQSELL matrix
1924  .  array - pointer to the data
1925 
1926  Level: intermediate
1927 
1928  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1929  @*/
1930 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1931 {
1932   PetscErrorCode ierr;
1933 
1934   PetscFunctionBegin;
1935   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1940 {
1941   Mat_SeqSELL    *b;
1942   PetscMPIInt    size;
1943   PetscErrorCode ierr;
1944 
1945   PetscFunctionBegin;
1946   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1947   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1948 
1949   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1950 
1951   B->data = (void*)b;
1952 
1953   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1954 
1955   b->row                = 0;
1956   b->col                = 0;
1957   b->icol               = 0;
1958   b->reallocs           = 0;
1959   b->ignorezeroentries  = PETSC_FALSE;
1960   b->roworiented        = PETSC_TRUE;
1961   b->nonew              = 0;
1962   b->diag               = 0;
1963   b->solve_work         = 0;
1964   B->spptr              = 0;
1965   b->saved_values       = 0;
1966   b->idiag              = 0;
1967   b->mdiag              = 0;
1968   b->ssor_work          = 0;
1969   b->omega              = 1.0;
1970   b->fshift             = 0.0;
1971   b->idiagvalid         = PETSC_FALSE;
1972   b->keepnonzeropattern = PETSC_FALSE;
1973 
1974   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1977   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1978   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1979   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1980   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1981   PetscFunctionReturn(0);
1982 }
1983 
1984 /*
1985  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1986  */
1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1988 {
1989   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1990   PetscInt       i,m=A->rmap->n;
1991   PetscInt       totalslices=a->totalslices;
1992   PetscErrorCode ierr;
1993 
1994   PetscFunctionBegin;
1995   c = (Mat_SeqSELL*)C->data;
1996 
1997   C->factortype = A->factortype;
1998   c->row        = 0;
1999   c->col        = 0;
2000   c->icol       = 0;
2001   c->reallocs   = 0;
2002 
2003   C->assembled = PETSC_TRUE;
2004 
2005   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2006   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2007 
2008   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2009   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2010   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2011   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2012 
2013   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2014   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2015 
2016   /* allocate the matrix space */
2017   if (mallocmatspace) {
2018     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2019     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2020 
2021     c->singlemalloc = PETSC_TRUE;
2022 
2023     if (m > 0) {
2024       ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr);
2025       if (cpvalues == MAT_COPY_VALUES) {
2026         ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr);
2027       } else {
2028         ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr);
2029       }
2030     }
2031   }
2032 
2033   c->ignorezeroentries = a->ignorezeroentries;
2034   c->roworiented       = a->roworiented;
2035   c->nonew             = a->nonew;
2036   if (a->diag) {
2037     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2038     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2039     for (i=0; i<m; i++) {
2040       c->diag[i] = a->diag[i];
2041     }
2042   } else c->diag = 0;
2043 
2044   c->solve_work         = 0;
2045   c->saved_values       = 0;
2046   c->idiag              = 0;
2047   c->ssor_work          = 0;
2048   c->keepnonzeropattern = a->keepnonzeropattern;
2049   c->free_val           = PETSC_TRUE;
2050   c->free_colidx        = PETSC_TRUE;
2051 
2052   c->maxallocmat  = a->maxallocmat;
2053   c->maxallocrow  = a->maxallocrow;
2054   c->rlenmax      = a->rlenmax;
2055   c->nz           = a->nz;
2056   C->preallocated = PETSC_TRUE;
2057 
2058   c->nonzerorowcnt = a->nonzerorowcnt;
2059   C->nonzerostate  = A->nonzerostate;
2060 
2061   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2066 {
2067   PetscErrorCode ierr;
2068 
2069   PetscFunctionBegin;
2070   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2071   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2072   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2073     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2074   }
2075   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2076   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2077   PetscFunctionReturn(0);
2078 }
2079 
2080 /*@C
2081  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2082 
2083  Collective
2084 
2085  Input Parameters:
2086 +  comm - MPI communicator, set to PETSC_COMM_SELF
2087 .  m - number of rows
2088 .  n - number of columns
2089 .  rlenmax - maximum number of nonzeros in a row
2090 -  rlen - array containing the number of nonzeros in the various rows
2091  (possibly different for each row) or NULL
2092 
2093  Output Parameter:
2094 .  A - the matrix
2095 
2096  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2097  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2098  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2099 
2100  Notes:
2101  If nnz is given then nz is ignored
2102 
2103  Specify the preallocated storage with either rlenmax or rlen (not both).
2104  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2105  allocation.  For large problems you MUST preallocate memory or you
2106  will get TERRIBLE performance, see the users' manual chapter on matrices.
2107 
2108  Level: intermediate
2109 
2110  .seealso: MatCreate(), MatCreateSELL(), MatSetValues()
2111 
2112  @*/
2113 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2114 {
2115   PetscErrorCode ierr;
2116 
2117   PetscFunctionBegin;
2118   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2119   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2120   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2121   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2122   PetscFunctionReturn(0);
2123 }
2124 
2125 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2126 {
2127   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2128   PetscInt       totalslices=a->totalslices;
2129   PetscErrorCode ierr;
2130 
2131   PetscFunctionBegin;
2132   /* If the  matrix dimensions are not equal,or no of nonzeros */
2133   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2134     *flg = PETSC_FALSE;
2135     PetscFunctionReturn(0);
2136   }
2137   /* if the a->colidx are the same */
2138   ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2139   if (!*flg) PetscFunctionReturn(0);
2140   /* if a->val are the same */
2141   ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2146 {
2147   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2148 
2149   PetscFunctionBegin;
2150   a->idiagvalid  = PETSC_FALSE;
2151   PetscFunctionReturn(0);
2152 }
2153 
2154 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2155 {
2156 #if defined(PETSC_USE_COMPLEX)
2157   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2158   PetscInt    i;
2159   PetscScalar *val = a->val;
2160 
2161   PetscFunctionBegin;
2162   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2163     val[i] = PetscConj(val[i]);
2164   }
2165 #else
2166   PetscFunctionBegin;
2167 #endif
2168   PetscFunctionReturn(0);
2169 }
2170