xref: /petsc/src/mat/impls/sell/seq/sell.c (revision e5a36eccef3d6b83a2c625c30d0dfd5adb4001f2)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective on MPI_Comm
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accurate as possible.
67  If it is underestimated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);}
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261 
262   if (reuse == MAT_REUSE_MATRIX) {
263     B = *newmat;
264   } else {
265     /* Can we just use ilen? */
266     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
267     for (i=0; i<m; i++) {
268       rowlengths[i] = ai[i+1] - ai[i];
269     }
270 
271     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
272     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
273     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
274     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
275     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
276   }
277 
278   for (row=0; row<m; row++) {
279     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
280     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
281     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
282   }
283   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval=a->val;
301   PetscInt          totalslices=a->totalslices;
302   const PetscInt    *acolidx=a->colidx;
303   PetscInt          i,j;
304   PetscErrorCode    ierr;
305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m512d           vec_x,vec_y,vec_vals;
307   __m256i           vec_idx;
308   __mmask8          mask;
309   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
310   __m256i           vec_idx2,vec_idx3,vec_idx4;
311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
312   __m128i           vec_idx;
313   __m256d           vec_x,vec_y,vec_y2,vec_vals;
314   MatScalar         yval;
315   PetscInt          r,rows_left,row,nnz_in_row;
316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
317   __m128d           vec_x_tmp;
318   __m256d           vec_x,vec_y,vec_y2,vec_vals;
319   MatScalar         yval;
320   PetscInt          r,rows_left,row,nnz_in_row;
321 #else
322   PetscScalar       sum[8];
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326 #pragma disjoint(*x,*y,*aval)
327 #endif
328 
329   PetscFunctionBegin;
330   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
331   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   for (i=0; i<totalslices; i++) { /* loop over slices */
334     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
335     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
336 
337     vec_y  = _mm512_setzero_pd();
338     vec_y2 = _mm512_setzero_pd();
339     vec_y3 = _mm512_setzero_pd();
340     vec_y4 = _mm512_setzero_pd();
341 
342     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
343     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
344     case 3:
345       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
346       acolidx += 8; aval += 8;
347       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
348       acolidx += 8; aval += 8;
349       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
350       acolidx += 8; aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
355       acolidx += 8; aval += 8;
356       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
357       acolidx += 8; aval += 8;
358       j += 2;
359       break;
360     case 1:
361       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
362       acolidx += 8; aval += 8;
363       j += 1;
364       break;
365     }
366     #pragma novector
367     for (; j<(a->sliidx[i+1]>>3); j+=4) {
368       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
369       acolidx += 8; aval += 8;
370       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
371       acolidx += 8; aval += 8;
372       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
375       acolidx += 8; aval += 8;
376     }
377 
378     vec_y = _mm512_add_pd(vec_y,vec_y2);
379     vec_y = _mm512_add_pd(vec_y,vec_y3);
380     vec_y = _mm512_add_pd(vec_y,vec_y4);
381     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
383       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
384     } else {
385       _mm512_storeu_pd(&y[8*i],vec_y);
386     }
387   }
388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389   for (i=0; i<totalslices; i++) { /* loop over full slices */
390     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
391     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
392 
393     /* last slice may have padding rows. Don't use vectorization. */
394     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
395       rows_left = A->rmap->n - 8*i;
396       for (r=0; r<rows_left; ++r) {
397         yval = (MatScalar)0;
398         row = 8*i + r;
399         nnz_in_row = a->rlen[row];
400         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
401         y[row] = yval;
402       }
403       break;
404     }
405 
406     vec_y  = _mm256_setzero_pd();
407     vec_y2 = _mm256_setzero_pd();
408 
409     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410     #pragma novector
411     #pragma unroll(2)
412     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
413       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
414       aval += 4; acolidx += 4;
415       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
416       aval += 4; acolidx += 4;
417     }
418 
419     _mm256_storeu_pd(y+i*8,vec_y);
420     _mm256_storeu_pd(y+i*8+4,vec_y2);
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   for (i=0; i<totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
426 
427     vec_y  = _mm256_setzero_pd();
428     vec_y2 = _mm256_setzero_pd();
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8*i;
433       for (r=0; r<rows_left; ++r) {
434         yval = (MatScalar)0;
435         row = 8*i + r;
436         nnz_in_row = a->rlen[row];
437         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444     #pragma novector
445     #pragma unroll(2)
446     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
447       vec_vals  = _mm256_loadu_pd(aval);
448       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
449       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
450       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
454       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
455       aval     += 4;
456 
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
465       aval     += 4;
466     }
467 
468     _mm256_storeu_pd(y + i*8,     vec_y);
469     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
470   }
471 #else
472   for (i=0; i<totalslices; i++) { /* loop over slices */
473     for (j=0; j<8; j++) sum[j] = 0.0;
474     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
475       sum[0] += aval[j] * x[acolidx[j]];
476       sum[1] += aval[j+1] * x[acolidx[j+1]];
477       sum[2] += aval[j+2] * x[acolidx[j+2]];
478       sum[3] += aval[j+3] * x[acolidx[j+3]];
479       sum[4] += aval[j+4] * x[acolidx[j+4]];
480       sum[5] += aval[j+5] * x[acolidx[j+5]];
481       sum[6] += aval[j+6] * x[acolidx[j+6]];
482       sum[7] += aval[j+7] * x[acolidx[j+7]];
483     }
484     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
485       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
486     } else {
487       for(j=0; j<8; j++) y[8*i+j] = sum[j];
488     }
489   }
490 #endif
491 
492   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
500 {
501   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
502   PetscScalar       *y,*z;
503   const PetscScalar *x;
504   const MatScalar   *aval=a->val;
505   PetscInt          totalslices=a->totalslices;
506   const PetscInt    *acolidx=a->colidx;
507   PetscInt          i,j;
508   PetscErrorCode    ierr;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d           vec_x,vec_y,vec_vals;
511   __m256i           vec_idx;
512   __mmask8          mask;
513   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
514   __m256i           vec_idx2,vec_idx3,vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d           vec_x_tmp;
517   __m256d           vec_x,vec_y,vec_y2,vec_vals;
518   MatScalar         yval;
519   PetscInt          r,row,nnz_in_row;
520 #else
521   PetscScalar       sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x,*y,*aval)
526 #endif
527 
528   PetscFunctionBegin;
529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i=0; i<totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
538       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
539     } else {
540       vec_y  = _mm512_loadu_pd(&y[8*i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
550       acolidx += 8; aval += 8;
551       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
552       acolidx += 8; aval += 8;
553       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
554       acolidx += 8; aval += 8;
555       j += 3;
556       break;
557     case 2:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       j += 2;
563       break;
564     case 1:
565       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
566       acolidx += 8; aval += 8;
567       j += 1;
568       break;
569     }
570     #pragma novector
571     for (; j<(a->sliidx[i+1]>>3); j+=4) {
572       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
573       acolidx += 8; aval += 8;
574       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
575       acolidx += 8; aval += 8;
576       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
579       acolidx += 8; aval += 8;
580     }
581 
582     vec_y = _mm512_add_pd(vec_y,vec_y2);
583     vec_y = _mm512_add_pd(vec_y,vec_y3);
584     vec_y = _mm512_add_pd(vec_y,vec_y4);
585     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
586       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
587     } else {
588       _mm512_storeu_pd(&z[8*i],vec_y);
589     }
590   }
591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
592   for (i=0; i<totalslices; i++) { /* loop over full slices */
593     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
594     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595 
596     /* last slice may have padding rows. Don't use vectorization. */
597     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
598       for (r=0; r<(A->rmap->n & 0x07); ++r) {
599         row        = 8*i + r;
600         yval       = (MatScalar)0.0;
601         nnz_in_row = a->rlen[row];
602         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
603         z[row] = y[row] + yval;
604       }
605       break;
606     }
607 
608     vec_y  = _mm256_loadu_pd(y+8*i);
609     vec_y2 = _mm256_loadu_pd(y+8*i+4);
610 
611     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
612     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
613       vec_vals  = _mm256_loadu_pd(aval);
614       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
615       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
616       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
620       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
621       aval     += 4;
622 
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
625       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
631       aval     += 4;
632     }
633 
634     _mm256_storeu_pd(z+i*8,vec_y);
635     _mm256_storeu_pd(z+i*8+4,vec_y2);
636   }
637 #else
638   for (i=0; i<totalslices; i++) { /* loop over slices */
639     for (j=0; j<8; j++) sum[j] = 0.0;
640     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
641       sum[0] += aval[j] * x[acolidx[j]];
642       sum[1] += aval[j+1] * x[acolidx[j+1]];
643       sum[2] += aval[j+2] * x[acolidx[j+2]];
644       sum[3] += aval[j+3] * x[acolidx[j+3]];
645       sum[4] += aval[j+4] * x[acolidx[j+4]];
646       sum[5] += aval[j+5] * x[acolidx[j+5]];
647       sum[6] += aval[j+6] * x[acolidx[j+6]];
648       sum[7] += aval[j+7] * x[acolidx[j+7]];
649     }
650     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
651       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
652     } else {
653       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
654     }
655   }
656 #endif
657 
658   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
665 {
666   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
667   PetscScalar       *y;
668   const PetscScalar *x;
669   const MatScalar   *aval=a->val;
670   const PetscInt    *acolidx=a->colidx;
671   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
672   PetscErrorCode    ierr;
673 
674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675 #pragma disjoint(*x,*y,*aval)
676 #endif
677 
678   PetscFunctionBegin;
679   if (A->symmetric) {
680     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
681     PetscFunctionReturn(0);
682   }
683   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
686   for (i=0; i<a->totalslices; i++) { /* loop over slices */
687     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
688       for (r=0; r<(A->rmap->n & 0x07); ++r) {
689         row        = 8*i + r;
690         nnz_in_row = a->rlen[row];
691         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
692       }
693       break;
694     }
695     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
696       y[acolidx[j]]   += aval[j] * x[8*i];
697       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
698       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
699       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
700       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
701       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
702       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
703       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
704     }
705   }
706   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   if (A->symmetric) {
718     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
719   } else {
720     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
721     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
722   }
723   PetscFunctionReturn(0);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
730 {
731   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
732   PetscInt       *diag,i;
733   PetscErrorCode ierr;
734 
735   PetscFunctionBegin;
736   *missing = PETSC_FALSE;
737   if (A->rmap->n > 0 && !(a->colidx)) {
738     *missing = PETSC_TRUE;
739     if (d) *d = 0;
740     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
741   } else {
742     diag = a->diag;
743     for (i=0; i<A->rmap->n; i++) {
744       if (diag[i] == -1) {
745         *missing = PETSC_TRUE;
746         if (d) *d = i;
747         ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr);
748         break;
749       }
750     }
751   }
752   PetscFunctionReturn(0);
753 }
754 
755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
756 {
757   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
758   PetscInt       i,j,m=A->rmap->n,shift;
759   PetscErrorCode ierr;
760 
761   PetscFunctionBegin;
762   if (!a->diag) {
763     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
764     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
765     a->free_diag = PETSC_TRUE;
766   }
767   for (i=0; i<m; i++) { /* loop over rows */
768     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
769     a->diag[i] = -1;
770     for (j=0; j<a->rlen[i]; j++) {
771       if (a->colidx[shift+j*8] == i) {
772         a->diag[i] = shift+j*8;
773         break;
774       }
775     }
776   }
777   PetscFunctionReturn(0);
778 }
779 
780 /*
781   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
782 */
783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
784 {
785   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
786   PetscInt       i,*diag,m = A->rmap->n;
787   MatScalar      *val = a->val;
788   PetscScalar    *idiag,*mdiag;
789   PetscErrorCode ierr;
790 
791   PetscFunctionBegin;
792   if (a->idiagvalid) PetscFunctionReturn(0);
793   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
794   diag = a->diag;
795   if (!a->idiag) {
796     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
797     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
798     val  = a->val;
799   }
800   mdiag = a->mdiag;
801   idiag = a->idiag;
802 
803   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804     for (i=0; i<m; i++) {
805       mdiag[i] = val[diag[i]];
806       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807         if (PetscRealPart(fshift)) {
808           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
809           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810           A->factorerror_zeropivot_value = 0.0;
811           A->factorerror_zeropivot_row   = i;
812         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
813       }
814       idiag[i] = 1.0/val[diag[i]];
815     }
816     ierr = PetscLogFlops(m);CHKERRQ(ierr);
817   } else {
818     for (i=0; i<m; i++) {
819       mdiag[i] = val[diag[i]];
820       idiag[i] = omega/(fshift + val[diag[i]]);
821     }
822     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
823   }
824   a->idiagvalid = PETSC_TRUE;
825   PetscFunctionReturn(0);
826 }
827 
828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829 {
830   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
835   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode MatDestroy_SeqSELL(Mat A)
840 {
841   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
842   PetscErrorCode ierr;
843 
844   PetscFunctionBegin;
845 #if defined(PETSC_USE_LOG)
846   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
847 #endif
848   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
849   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
850   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
851   ierr = PetscFree(a->diag);CHKERRQ(ierr);
852   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
853   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
854   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
855   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
856   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
857   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
858   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
859 
860   ierr = PetscFree(A->data);CHKERRQ(ierr);
861 
862   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
863   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
864   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
865 #if defined(PETSC_HAVE_ELEMENTAL)
866 #endif
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
872 {
873   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
874   PetscErrorCode ierr;
875 
876   PetscFunctionBegin;
877   switch (op) {
878   case MAT_ROW_ORIENTED:
879     a->roworiented = flg;
880     break;
881   case MAT_KEEP_NONZERO_PATTERN:
882     a->keepnonzeropattern = flg;
883     break;
884   case MAT_NEW_NONZERO_LOCATIONS:
885     a->nonew = (flg ? 0 : 1);
886     break;
887   case MAT_NEW_NONZERO_LOCATION_ERR:
888     a->nonew = (flg ? -1 : 0);
889     break;
890   case MAT_NEW_NONZERO_ALLOCATION_ERR:
891     a->nonew = (flg ? -2 : 0);
892     break;
893   case MAT_UNUSED_NONZERO_LOCATION_ERR:
894     a->nounused = (flg ? -1 : 0);
895     break;
896   case MAT_NEW_DIAGONALS:
897   case MAT_IGNORE_OFF_PROC_ENTRIES:
898   case MAT_USE_HASH_TABLE:
899     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
900     break;
901   case MAT_SPD:
902   case MAT_SYMMETRIC:
903   case MAT_STRUCTURALLY_SYMMETRIC:
904   case MAT_HERMITIAN:
905   case MAT_SYMMETRY_ETERNAL:
906     /* These options are handled directly by MatSetOption() */
907     break;
908   default:
909     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
910   }
911   PetscFunctionReturn(0);
912 }
913 
914 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
915 {
916   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
917   PetscInt       i,j,n,shift;
918   PetscScalar    *x,zero=0.0;
919   PetscErrorCode ierr;
920 
921   PetscFunctionBegin;
922   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
923   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
924 
925   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
926     PetscInt *diag=a->diag;
927     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
928     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
929     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
930     PetscFunctionReturn(0);
931   }
932 
933   ierr = VecSet(v,zero);CHKERRQ(ierr);
934   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
935   for (i=0; i<n; i++) { /* loop over rows */
936     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
937     x[i] = 0;
938     for (j=0; j<a->rlen[i]; j++) {
939       if (a->colidx[shift+j*8] == i) {
940         x[i] = a->val[shift+j*8];
941         break;
942       }
943     }
944   }
945   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
946   PetscFunctionReturn(0);
947 }
948 
949 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
950 {
951   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
952   const PetscScalar *l,*r;
953   PetscInt          i,j,m,n,row;
954   PetscErrorCode    ierr;
955 
956   PetscFunctionBegin;
957   if (ll) {
958     /* The local size is used so that VecMPI can be passed to this routine
959        by MatDiagonalScale_MPISELL */
960     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
961     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
962     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
963     for (i=0; i<a->totalslices; i++) { /* loop over slices */
964       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
965         a->val[j] *= l[8*i+row];
966       }
967     }
968     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
969     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
970   }
971   if (rr) {
972     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
973     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
974     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
975     for (i=0; i<a->totalslices; i++) { /* loop over slices */
976       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
977         a->val[j] *= r[a->colidx[j]];
978       }
979     }
980     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
981     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
982   }
983   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
984   PetscFunctionReturn(0);
985 }
986 
987 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
988 
989 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
990 {
991   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
992   PetscInt    *cp,i,k,low,high,t,row,col,l;
993   PetscInt    shift;
994   MatScalar   *vp;
995 
996   PetscFunctionBegin;
997   for (k=0; k<m; k++) { /* loop over requested rows */
998     row = im[k];
999     if (row<0) continue;
1000 #if defined(PETSC_USE_DEBUG)
1001     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1002 #endif
1003     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1004     cp = a->colidx+shift; /* pointer to the row */
1005     vp = a->val+shift; /* pointer to the row */
1006     for (l=0; l<n; l++) { /* loop over requested columns */
1007       col = in[l];
1008       if (col<0) continue;
1009 #if defined(PETSC_USE_DEBUG)
1010       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1011 #endif
1012       high = a->rlen[row]; low = 0; /* assume unsorted */
1013       while (high-low > 5) {
1014         t = (low+high)/2;
1015         if (*(cp+t*8) > col) high = t;
1016         else low = t;
1017       }
1018       for (i=low; i<high; i++) {
1019         if (*(cp+8*i) > col) break;
1020         if (*(cp+8*i) == col) {
1021           *v++ = *(vp+8*i);
1022           goto finished;
1023         }
1024       }
1025       *v++ = 0.0;
1026     finished:;
1027     }
1028   }
1029   PetscFunctionReturn(0);
1030 }
1031 
1032 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1033 {
1034   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1035   PetscInt          i,j,m=A->rmap->n,shift;
1036   const char        *name;
1037   PetscViewerFormat format;
1038   PetscErrorCode    ierr;
1039 
1040   PetscFunctionBegin;
1041   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1042   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1043     PetscInt nofinalvalue = 0;
1044     /*
1045     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1046       nofinalvalue = 1;
1047     }
1048     */
1049     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1050     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1051     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1052 #if defined(PETSC_USE_COMPLEX)
1053     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1054 #else
1055     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1056 #endif
1057     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1058 
1059     for (i=0; i<m; i++) {
1060       shift = a->sliidx[i>>3]+(i&0x07);
1061       for (j=0; j<a->rlen[i]; j++) {
1062 #if defined(PETSC_USE_COMPLEX)
1063         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1064 #else
1065         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1066 #endif
1067       }
1068     }
1069     /*
1070     if (nofinalvalue) {
1071 #if defined(PETSC_USE_COMPLEX)
1072       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1073 #else
1074       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1075 #endif
1076     }
1077     */
1078     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1079     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1080     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1081   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1082     PetscFunctionReturn(0);
1083   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1084     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1085     for (i=0; i<m; i++) {
1086       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1087       shift = a->sliidx[i>>3]+(i&0x07);
1088       for (j=0; j<a->rlen[i]; j++) {
1089 #if defined(PETSC_USE_COMPLEX)
1090         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1091           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1092         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1093           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1094         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1095           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1096         }
1097 #else
1098         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1099 #endif
1100       }
1101       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1102     }
1103     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1104   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1105     PetscInt    cnt=0,jcnt;
1106     PetscScalar value;
1107 #if defined(PETSC_USE_COMPLEX)
1108     PetscBool   realonly=PETSC_TRUE;
1109     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1110       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1111         realonly = PETSC_FALSE;
1112         break;
1113       }
1114     }
1115 #endif
1116 
1117     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1118     for (i=0; i<m; i++) {
1119       jcnt = 0;
1120       shift = a->sliidx[i>>3]+(i&0x07);
1121       for (j=0; j<A->cmap->n; j++) {
1122         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1123           value = a->val[cnt++];
1124           jcnt++;
1125         } else {
1126           value = 0.0;
1127         }
1128 #if defined(PETSC_USE_COMPLEX)
1129         if (realonly) {
1130           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1131         } else {
1132           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1133         }
1134 #else
1135         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1136 #endif
1137       }
1138       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1139     }
1140     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1141   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1142     PetscInt fshift=1;
1143     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1144 #if defined(PETSC_USE_COMPLEX)
1145     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1146 #else
1147     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1148 #endif
1149     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1150     for (i=0; i<m; i++) {
1151       shift = a->sliidx[i>>3]+(i&0x07);
1152       for (j=0; j<a->rlen[i]; j++) {
1153 #if defined(PETSC_USE_COMPLEX)
1154         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1155 #else
1156         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1157 #endif
1158       }
1159     }
1160     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1161   } else if (format == PETSC_VIEWER_NATIVE) {
1162     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1163       PetscInt row;
1164       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1165       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1166 #if defined(PETSC_USE_COMPLEX)
1167         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1168           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1169         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1170           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1171         } else {
1172           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1173         }
1174 #else
1175         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1176 #endif
1177       }
1178     }
1179   } else {
1180     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1181     if (A->factortype) {
1182       for (i=0; i<m; i++) {
1183         shift = a->sliidx[i>>3]+(i&0x07);
1184         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1185         /* L part */
1186         for (j=shift; j<a->diag[i]; j+=8) {
1187 #if defined(PETSC_USE_COMPLEX)
1188           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1189             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1190           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1191             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1192           } else {
1193             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1194           }
1195 #else
1196           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1197 #endif
1198         }
1199         /* diagonal */
1200         j = a->diag[i];
1201 #if defined(PETSC_USE_COMPLEX)
1202         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1203           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1204         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1205           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1206         } else {
1207           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1208         }
1209 #else
1210         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1211 #endif
1212 
1213         /* U part */
1214         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1215 #if defined(PETSC_USE_COMPLEX)
1216           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1217             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1218           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1219             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1220           } else {
1221             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1222           }
1223 #else
1224           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1225 #endif
1226         }
1227         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1228       }
1229     } else {
1230       for (i=0; i<m; i++) {
1231         shift = a->sliidx[i>>3]+(i&0x07);
1232         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1233         for (j=0; j<a->rlen[i]; j++) {
1234 #if defined(PETSC_USE_COMPLEX)
1235           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1236             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1237           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1238             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1239           } else {
1240             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1241           }
1242 #else
1243           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1244 #endif
1245         }
1246         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1247       }
1248     }
1249     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1250   }
1251   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 #include <petscdraw.h>
1256 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1257 {
1258   Mat               A=(Mat)Aa;
1259   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1260   PetscInt          i,j,m=A->rmap->n,shift;
1261   int               color;
1262   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1263   PetscViewer       viewer;
1264   PetscViewerFormat format;
1265   PetscErrorCode    ierr;
1266 
1267   PetscFunctionBegin;
1268   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1269   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1270   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1271 
1272   /* loop over matrix elements drawing boxes */
1273 
1274   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1275     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1276     /* Blue for negative, Cyan for zero and  Red for positive */
1277     color = PETSC_DRAW_BLUE;
1278     for (i=0; i<m; i++) {
1279       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1280       y_l = m - i - 1.0; y_r = y_l + 1.0;
1281       for (j=0; j<a->rlen[i]; j++) {
1282         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1283         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1284         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1285       }
1286     }
1287     color = PETSC_DRAW_CYAN;
1288     for (i=0; i<m; i++) {
1289       shift = a->sliidx[i>>3]+(i&0x07);
1290       y_l = m - i - 1.0; y_r = y_l + 1.0;
1291       for (j=0; j<a->rlen[i]; j++) {
1292         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1293         if (a->val[shift+8*j] !=  0.) continue;
1294         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1295       }
1296     }
1297     color = PETSC_DRAW_RED;
1298     for (i=0; i<m; i++) {
1299       shift = a->sliidx[i>>3]+(i&0x07);
1300       y_l = m - i - 1.0; y_r = y_l + 1.0;
1301       for (j=0; j<a->rlen[i]; j++) {
1302         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1303         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1304         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1305       }
1306     }
1307     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1308   } else {
1309     /* use contour shading to indicate magnitude of values */
1310     /* first determine max of all nonzero values */
1311     PetscReal minv=0.0,maxv=0.0;
1312     PetscInt  count=0;
1313     PetscDraw popup;
1314     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1315       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1316     }
1317     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1318     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1319     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1320 
1321     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1322     for (i=0; i<m; i++) {
1323       shift = a->sliidx[i>>3]+(i&0x07);
1324       y_l = m - i - 1.0;
1325       y_r = y_l + 1.0;
1326       for (j=0; j<a->rlen[i]; j++) {
1327         x_l = a->colidx[shift+j*8];
1328         x_r = x_l + 1.0;
1329         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1330         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1331         count++;
1332       }
1333     }
1334     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1335   }
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 #include <petscdraw.h>
1340 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1341 {
1342   PetscDraw      draw;
1343   PetscReal      xr,yr,xl,yl,h,w;
1344   PetscBool      isnull;
1345   PetscErrorCode ierr;
1346 
1347   PetscFunctionBegin;
1348   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1349   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1350   if (isnull) PetscFunctionReturn(0);
1351 
1352   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1353   xr  += w;          yr += h;         xl = -w;     yl = -h;
1354   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1355   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1356   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1357   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1358   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1363 {
1364   PetscBool      iascii,isbinary,isdraw;
1365   PetscErrorCode ierr;
1366 
1367   PetscFunctionBegin;
1368   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1369   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1370   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1371   if (iascii) {
1372     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1373   } else if (isbinary) {
1374     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1375   } else if (isdraw) {
1376     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1377   }
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1382 {
1383   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1384   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1385   MatScalar      *vp;
1386   PetscErrorCode ierr;
1387 
1388   PetscFunctionBegin;
1389   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1390   /* To do: compress out the unused elements */
1391   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1392   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1393   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1394   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1395   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1396   for (i=0; i<a->totalslices; ++i) {
1397     shift = a->sliidx[i];    /* starting index of the slice */
1398     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1399     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1400     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1401       row  = 8*i + row_in_slice;
1402       nrow = a->rlen[row]; /* number of nonzeros in row */
1403       /*
1404         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1405         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1406       */
1407       lastcol = 0;
1408       if (nrow>0) { /* nonempty row */
1409         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1410       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1411         for (j=1;j<8;j++) {
1412           if (a->rlen[8*i+j]) {
1413             lastcol = cp[j];
1414             break;
1415           }
1416         }
1417       } else {
1418         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1419       }
1420 
1421       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1422         cp[8*k+row_in_slice] = lastcol;
1423         vp[8*k+row_in_slice] = (MatScalar)0;
1424       }
1425     }
1426   }
1427 
1428   A->info.mallocs += a->reallocs;
1429   a->reallocs      = 0;
1430 
1431   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1436 {
1437   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1438 
1439   PetscFunctionBegin;
1440   info->block_size   = 1.0;
1441   info->nz_allocated = (double)a->maxallocmat;
1442   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1443   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1444   info->assemblies   = (double)A->num_ass;
1445   info->mallocs      = (double)A->info.mallocs;
1446   info->memory       = ((PetscObject)A)->mem;
1447   if (A->factortype) {
1448     info->fill_ratio_given  = A->info.fill_ratio_given;
1449     info->fill_ratio_needed = A->info.fill_ratio_needed;
1450     info->factor_mallocs    = A->info.factor_mallocs;
1451   } else {
1452     info->fill_ratio_given  = 0;
1453     info->fill_ratio_needed = 0;
1454     info->factor_mallocs    = 0;
1455   }
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1460 {
1461   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1462   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1463   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1464   MatScalar      *vp,value;
1465   PetscErrorCode ierr;
1466 
1467   PetscFunctionBegin;
1468   for (k=0; k<m; k++) { /* loop over added rows */
1469     row = im[k];
1470     if (row < 0) continue;
1471 #if defined(PETSC_USE_DEBUG)
1472     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1473 #endif
1474     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1475     cp    = a->colidx+shift; /* pointer to the row */
1476     vp    = a->val+shift; /* pointer to the row */
1477     nrow  = a->rlen[row];
1478     low   = 0;
1479     high  = nrow;
1480 
1481     for (l=0; l<n; l++) { /* loop over added columns */
1482       col = in[l];
1483       if (col<0) continue;
1484 #if defined(PETSC_USE_DEBUG)
1485       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1486 #endif
1487       if (a->roworiented) {
1488         value = v[l+k*n];
1489       } else {
1490         value = v[k+l*m];
1491       }
1492       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1493 
1494       /* search in this row for the specified colmun, i indicates the column to be set */
1495       if (col <= lastcol) low = 0;
1496       else high = nrow;
1497       lastcol = col;
1498       while (high-low > 5) {
1499         t = (low+high)/2;
1500         if (*(cp+t*8) > col) high = t;
1501         else low = t;
1502       }
1503       for (i=low; i<high; i++) {
1504         if (*(cp+i*8) > col) break;
1505         if (*(cp+i*8) == col) {
1506           if (is == ADD_VALUES) *(vp+i*8) += value;
1507           else *(vp+i*8) = value;
1508           low = i + 1;
1509           goto noinsert;
1510         }
1511       }
1512       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1513       if (nonew == 1) goto noinsert;
1514       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1515       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1516       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1517       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1518       for (ii=nrow-1; ii>=i; ii--) {
1519         *(cp+(ii+1)*8) = *(cp+ii*8);
1520         *(vp+(ii+1)*8) = *(vp+ii*8);
1521       }
1522       a->rlen[row]++;
1523       *(cp+i*8) = col;
1524       *(vp+i*8) = value;
1525       a->nz++;
1526       A->nonzerostate++;
1527       low = i+1; high++; nrow++;
1528 noinsert:;
1529     }
1530     a->rlen[row] = nrow;
1531   }
1532   PetscFunctionReturn(0);
1533 }
1534 
1535 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1536 {
1537   PetscErrorCode ierr;
1538 
1539   PetscFunctionBegin;
1540   /* If the two matrices have the same copy implementation, use fast copy. */
1541   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1542     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1543     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1544 
1545     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1546     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1547   } else {
1548     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1549   }
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1554 {
1555   PetscErrorCode ierr;
1556 
1557   PetscFunctionBegin;
1558   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1559   PetscFunctionReturn(0);
1560 }
1561 
1562 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1563 {
1564   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1565 
1566   PetscFunctionBegin;
1567   *array = a->val;
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1572 {
1573   PetscFunctionBegin;
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1578 {
1579   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1580   PetscInt    i;
1581   MatScalar   *aval=a->val;
1582 
1583   PetscFunctionBegin;
1584   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1589 {
1590   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1591   PetscInt       i;
1592   MatScalar      *aval=a->val;
1593   PetscErrorCode ierr;
1594 
1595   PetscFunctionBegin;
1596   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1597   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1602 {
1603   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1604   MatScalar      *aval=a->val;
1605   PetscScalar    oalpha=alpha;
1606   PetscBLASInt   one=1,size;
1607   PetscErrorCode ierr;
1608 
1609   PetscFunctionBegin;
1610   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1611   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1612   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1613   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1618 {
1619   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1620   PetscErrorCode ierr;
1621 
1622   PetscFunctionBegin;
1623   if (!Y->preallocated || !y->nz) {
1624     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1625   }
1626   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1631 {
1632   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1633   PetscScalar       *x,sum,*t;
1634   const MatScalar   *idiag=0,*mdiag;
1635   const PetscScalar *b,*xb;
1636   PetscInt          n,m=A->rmap->n,i,j,shift;
1637   const PetscInt    *diag;
1638   PetscErrorCode    ierr;
1639 
1640   PetscFunctionBegin;
1641   its = its*lits;
1642 
1643   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1644   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1645   a->fshift = fshift;
1646   a->omega  = omega;
1647 
1648   diag  = a->diag;
1649   t     = a->ssor_work;
1650   idiag = a->idiag;
1651   mdiag = a->mdiag;
1652 
1653   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1655   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1656   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1657   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1658   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1659 
1660   if (flag & SOR_ZERO_INITIAL_GUESS) {
1661     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1662       for (i=0; i<m; i++) {
1663         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1664         sum   = b[i];
1665         n     = (diag[i]-shift)/8;
1666         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1667         t[i]  = sum;
1668         x[i]  = sum*idiag[i];
1669       }
1670       xb   = t;
1671       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1672     } else xb = b;
1673     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1674       for (i=m-1; i>=0; i--) {
1675         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1676         sum   = xb[i];
1677         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1678         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1679         if (xb == b) {
1680           x[i] = sum*idiag[i];
1681         } else {
1682           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1683         }
1684       }
1685       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1686     }
1687     its--;
1688   }
1689   while (its--) {
1690     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1691       for (i=0; i<m; i++) {
1692         /* lower */
1693         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1694         sum   = b[i];
1695         n     = (diag[i]-shift)/8;
1696         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1697         t[i]  = sum;             /* save application of the lower-triangular part */
1698         /* upper */
1699         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1700         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1701         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1702       }
1703       xb   = t;
1704       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1705     } else xb = b;
1706     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1707       for (i=m-1; i>=0; i--) {
1708         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1709         sum = xb[i];
1710         if (xb == b) {
1711           /* whole matrix (no checkpointing available) */
1712           n     = a->rlen[i];
1713           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1714           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1715         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1716           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1717           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1718           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1719         }
1720       }
1721       if (xb == b) {
1722         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1723       } else {
1724         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1725       }
1726     }
1727   }
1728   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1729   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1730   PetscFunctionReturn(0);
1731 }
1732 
1733 /* -------------------------------------------------------------------*/
1734 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1735                                        MatGetRow_SeqSELL,
1736                                        MatRestoreRow_SeqSELL,
1737                                        MatMult_SeqSELL,
1738                                /* 4*/  MatMultAdd_SeqSELL,
1739                                        MatMultTranspose_SeqSELL,
1740                                        MatMultTransposeAdd_SeqSELL,
1741                                        0,
1742                                        0,
1743                                        0,
1744                                /* 10*/ 0,
1745                                        0,
1746                                        0,
1747                                        MatSOR_SeqSELL,
1748                                        0,
1749                                /* 15*/ MatGetInfo_SeqSELL,
1750                                        MatEqual_SeqSELL,
1751                                        MatGetDiagonal_SeqSELL,
1752                                        MatDiagonalScale_SeqSELL,
1753                                        0,
1754                                /* 20*/ 0,
1755                                        MatAssemblyEnd_SeqSELL,
1756                                        MatSetOption_SeqSELL,
1757                                        MatZeroEntries_SeqSELL,
1758                                /* 24*/ 0,
1759                                        0,
1760                                        0,
1761                                        0,
1762                                        0,
1763                                /* 29*/ MatSetUp_SeqSELL,
1764                                        0,
1765                                        0,
1766                                        0,
1767                                        0,
1768                                /* 34*/ MatDuplicate_SeqSELL,
1769                                        0,
1770                                        0,
1771                                        0,
1772                                        0,
1773                                /* 39*/ 0,
1774                                        0,
1775                                        0,
1776                                        MatGetValues_SeqSELL,
1777                                        MatCopy_SeqSELL,
1778                                /* 44*/ 0,
1779                                        MatScale_SeqSELL,
1780                                        MatShift_SeqSELL,
1781                                        0,
1782                                        0,
1783                                /* 49*/ 0,
1784                                        0,
1785                                        0,
1786                                        0,
1787                                        0,
1788                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1789                                        0,
1790                                        0,
1791                                        0,
1792                                        0,
1793                                /* 59*/ 0,
1794                                        MatDestroy_SeqSELL,
1795                                        MatView_SeqSELL,
1796                                        0,
1797                                        0,
1798                                /* 64*/ 0,
1799                                        0,
1800                                        0,
1801                                        0,
1802                                        0,
1803                                /* 69*/ 0,
1804                                        0,
1805                                        0,
1806                                        0,
1807                                        0,
1808                                /* 74*/ 0,
1809                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1810                                        0,
1811                                        0,
1812                                        0,
1813                                /* 79*/ 0,
1814                                        0,
1815                                        0,
1816                                        0,
1817                                        0,
1818                                /* 84*/ 0,
1819                                        0,
1820                                        0,
1821                                        0,
1822                                        0,
1823                                /* 89*/ 0,
1824                                        0,
1825                                        0,
1826                                        0,
1827                                        0,
1828                                /* 94*/ 0,
1829                                        0,
1830                                        0,
1831                                        0,
1832                                        0,
1833                                /* 99*/ 0,
1834                                        0,
1835                                        0,
1836                                        MatConjugate_SeqSELL,
1837                                        0,
1838                                /*104*/ 0,
1839                                        0,
1840                                        0,
1841                                        0,
1842                                        0,
1843                                /*109*/ 0,
1844                                        0,
1845                                        0,
1846                                        0,
1847                                        MatMissingDiagonal_SeqSELL,
1848                                /*114*/ 0,
1849                                        0,
1850                                        0,
1851                                        0,
1852                                        0,
1853                                /*119*/ 0,
1854                                        0,
1855                                        0,
1856                                        0,
1857                                        0,
1858                                /*124*/ 0,
1859                                        0,
1860                                        0,
1861                                        0,
1862                                        0,
1863                                /*129*/ 0,
1864                                        0,
1865                                        0,
1866                                        0,
1867                                        0,
1868                                /*134*/ 0,
1869                                        0,
1870                                        0,
1871                                        0,
1872                                        0,
1873                                /*139*/ 0,
1874                                        0,
1875                                        0,
1876                                        MatFDColoringSetUp_SeqXAIJ,
1877                                        0,
1878                                 /*144*/0
1879 };
1880 
1881 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1882 {
1883   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1884   PetscErrorCode ierr;
1885 
1886   PetscFunctionBegin;
1887   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1888 
1889   /* allocate space for values if not already there */
1890   if (!a->saved_values) {
1891     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1892     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1893   }
1894 
1895   /* copy values over */
1896   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1901 {
1902   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1903   PetscErrorCode ierr;
1904 
1905   PetscFunctionBegin;
1906   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1907   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1908   /* copy values over */
1909   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1910   PetscFunctionReturn(0);
1911 }
1912 
1913 /*@C
1914  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1915 
1916  Not Collective
1917 
1918  Input Parameters:
1919  .  mat - a MATSEQSELL matrix
1920  .  array - pointer to the data
1921 
1922  Level: intermediate
1923 
1924  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1925  @*/
1926 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1927 {
1928   PetscErrorCode ierr;
1929 
1930   PetscFunctionBegin;
1931   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1932   PetscFunctionReturn(0);
1933 }
1934 
1935 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1936 {
1937   Mat_SeqSELL    *b;
1938   PetscMPIInt    size;
1939   PetscErrorCode ierr;
1940 
1941   PetscFunctionBegin;
1942   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1943   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1944 
1945   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1946 
1947   B->data = (void*)b;
1948 
1949   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1950 
1951   b->row                = 0;
1952   b->col                = 0;
1953   b->icol               = 0;
1954   b->reallocs           = 0;
1955   b->ignorezeroentries  = PETSC_FALSE;
1956   b->roworiented        = PETSC_TRUE;
1957   b->nonew              = 0;
1958   b->diag               = 0;
1959   b->solve_work         = 0;
1960   B->spptr              = 0;
1961   b->saved_values       = 0;
1962   b->idiag              = 0;
1963   b->mdiag              = 0;
1964   b->ssor_work          = 0;
1965   b->omega              = 1.0;
1966   b->fshift             = 0.0;
1967   b->idiagvalid         = PETSC_FALSE;
1968   b->keepnonzeropattern = PETSC_FALSE;
1969 
1970   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1971   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1972   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1973   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1974   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1977   PetscFunctionReturn(0);
1978 }
1979 
1980 /*
1981  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1982  */
1983 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1984 {
1985   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1986   PetscInt       i,m=A->rmap->n;
1987   PetscInt       totalslices=a->totalslices;
1988   PetscErrorCode ierr;
1989 
1990   PetscFunctionBegin;
1991   c = (Mat_SeqSELL*)C->data;
1992 
1993   C->factortype = A->factortype;
1994   c->row        = 0;
1995   c->col        = 0;
1996   c->icol       = 0;
1997   c->reallocs   = 0;
1998 
1999   C->assembled = PETSC_TRUE;
2000 
2001   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2002   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2003 
2004   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2005   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2006   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2007   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2008 
2009   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2010   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2011 
2012   /* allocate the matrix space */
2013   if (mallocmatspace) {
2014     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2015     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2016 
2017     c->singlemalloc = PETSC_TRUE;
2018 
2019     if (m > 0) {
2020       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2021       if (cpvalues == MAT_COPY_VALUES) {
2022         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2023       } else {
2024         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2025       }
2026     }
2027   }
2028 
2029   c->ignorezeroentries = a->ignorezeroentries;
2030   c->roworiented       = a->roworiented;
2031   c->nonew             = a->nonew;
2032   if (a->diag) {
2033     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2034     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2035     for (i=0; i<m; i++) {
2036       c->diag[i] = a->diag[i];
2037     }
2038   } else c->diag = 0;
2039 
2040   c->solve_work         = 0;
2041   c->saved_values       = 0;
2042   c->idiag              = 0;
2043   c->ssor_work          = 0;
2044   c->keepnonzeropattern = a->keepnonzeropattern;
2045   c->free_val           = PETSC_TRUE;
2046   c->free_colidx        = PETSC_TRUE;
2047 
2048   c->maxallocmat  = a->maxallocmat;
2049   c->maxallocrow  = a->maxallocrow;
2050   c->rlenmax      = a->rlenmax;
2051   c->nz           = a->nz;
2052   C->preallocated = PETSC_TRUE;
2053 
2054   c->nonzerorowcnt = a->nonzerorowcnt;
2055   C->nonzerostate  = A->nonzerostate;
2056 
2057   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2058   PetscFunctionReturn(0);
2059 }
2060 
2061 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2062 {
2063   PetscErrorCode ierr;
2064 
2065   PetscFunctionBegin;
2066   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2067   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2068   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2069     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2070   }
2071   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2072   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2073   PetscFunctionReturn(0);
2074 }
2075 
2076 /*@C
2077  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2078 
2079  Collective on MPI_Comm
2080 
2081  Input Parameters:
2082  +  comm - MPI communicator, set to PETSC_COMM_SELF
2083  .  m - number of rows
2084  .  n - number of columns
2085  .  rlenmax - maximum number of nonzeros in a row
2086  -  rlen - array containing the number of nonzeros in the various rows
2087  (possibly different for each row) or NULL
2088 
2089  Output Parameter:
2090  .  A - the matrix
2091 
2092  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2093  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2094  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2095 
2096  Notes:
2097  If nnz is given then nz is ignored
2098 
2099  Specify the preallocated storage with either rlenmax or rlen (not both).
2100  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2101  allocation.  For large problems you MUST preallocate memory or you
2102  will get TERRIBLE performance, see the users' manual chapter on matrices.
2103 
2104  Level: intermediate
2105 
2106  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2107 
2108  @*/
2109 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2110 {
2111   PetscErrorCode ierr;
2112 
2113   PetscFunctionBegin;
2114   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2115   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2116   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2117   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2118   PetscFunctionReturn(0);
2119 }
2120 
2121 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2122 {
2123   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2124   PetscInt       totalslices=a->totalslices;
2125   PetscErrorCode ierr;
2126 
2127   PetscFunctionBegin;
2128   /* If the  matrix dimensions are not equal,or no of nonzeros */
2129   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2130     *flg = PETSC_FALSE;
2131     PetscFunctionReturn(0);
2132   }
2133   /* if the a->colidx are the same */
2134   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2135   if (!*flg) PetscFunctionReturn(0);
2136   /* if a->val are the same */
2137   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2138   PetscFunctionReturn(0);
2139 }
2140 
2141 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2142 {
2143   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2144 
2145   PetscFunctionBegin;
2146   a->idiagvalid  = PETSC_FALSE;
2147   PetscFunctionReturn(0);
2148 }
2149 
2150 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2151 {
2152 #if defined(PETSC_USE_COMPLEX)
2153   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2154   PetscInt    i;
2155   PetscScalar *val = a->val;
2156 
2157   PetscFunctionBegin;
2158   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2159     val[i] = PetscConj(val[i]);
2160   }
2161 #else
2162   PetscFunctionBegin;
2163 #endif
2164   PetscFunctionReturn(0);
2165 }
2166