xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 2475b7ca256cea2a4b7cbf2d8babcda14e5fa36e)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accurate as possible.
67  If it is underestimated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);}
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261 
262   if (reuse == MAT_REUSE_MATRIX) {
263     B = *newmat;
264   } else {
265     /* Can we just use ilen? */
266     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
267     for (i=0; i<m; i++) {
268       rowlengths[i] = ai[i+1] - ai[i];
269     }
270 
271     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
272     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
273     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
274     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
275     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
276   }
277 
278   for (row=0; row<m; row++) {
279     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
280     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
281     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
282   }
283   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval=a->val;
301   PetscInt          totalslices=a->totalslices;
302   const PetscInt    *acolidx=a->colidx;
303   PetscInt          i,j;
304   PetscErrorCode    ierr;
305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m512d           vec_x,vec_y,vec_vals;
307   __m256i           vec_idx;
308   __mmask8          mask;
309   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
310   __m256i           vec_idx2,vec_idx3,vec_idx4;
311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
312   __m128i           vec_idx;
313   __m256d           vec_x,vec_y,vec_y2,vec_vals;
314   MatScalar         yval;
315   PetscInt          r,rows_left,row,nnz_in_row;
316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
317   __m128d           vec_x_tmp;
318   __m256d           vec_x,vec_y,vec_y2,vec_vals;
319   MatScalar         yval;
320   PetscInt          r,rows_left,row,nnz_in_row;
321 #else
322   PetscScalar       sum[8];
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326 #pragma disjoint(*x,*y,*aval)
327 #endif
328 
329   PetscFunctionBegin;
330   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
331   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   for (i=0; i<totalslices; i++) { /* loop over slices */
334     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
335     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
336 
337     vec_y  = _mm512_setzero_pd();
338     vec_y2 = _mm512_setzero_pd();
339     vec_y3 = _mm512_setzero_pd();
340     vec_y4 = _mm512_setzero_pd();
341 
342     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
343     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
344     case 3:
345       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
346       acolidx += 8; aval += 8;
347       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
348       acolidx += 8; aval += 8;
349       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
350       acolidx += 8; aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
355       acolidx += 8; aval += 8;
356       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
357       acolidx += 8; aval += 8;
358       j += 2;
359       break;
360     case 1:
361       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
362       acolidx += 8; aval += 8;
363       j += 1;
364       break;
365     }
366     #pragma novector
367     for (; j<(a->sliidx[i+1]>>3); j+=4) {
368       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
369       acolidx += 8; aval += 8;
370       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
371       acolidx += 8; aval += 8;
372       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
375       acolidx += 8; aval += 8;
376     }
377 
378     vec_y = _mm512_add_pd(vec_y,vec_y2);
379     vec_y = _mm512_add_pd(vec_y,vec_y3);
380     vec_y = _mm512_add_pd(vec_y,vec_y4);
381     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
383       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
384     } else {
385       _mm512_storeu_pd(&y[8*i],vec_y);
386     }
387   }
388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389   for (i=0; i<totalslices; i++) { /* loop over full slices */
390     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
391     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
392 
393     /* last slice may have padding rows. Don't use vectorization. */
394     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
395       rows_left = A->rmap->n - 8*i;
396       for (r=0; r<rows_left; ++r) {
397         yval = (MatScalar)0;
398         row = 8*i + r;
399         nnz_in_row = a->rlen[row];
400         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
401         y[row] = yval;
402       }
403       break;
404     }
405 
406     vec_y  = _mm256_setzero_pd();
407     vec_y2 = _mm256_setzero_pd();
408 
409     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410     #pragma novector
411     #pragma unroll(2)
412     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
413       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
414       aval += 4; acolidx += 4;
415       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
416       aval += 4; acolidx += 4;
417     }
418 
419     _mm256_storeu_pd(y+i*8,vec_y);
420     _mm256_storeu_pd(y+i*8+4,vec_y2);
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   for (i=0; i<totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
426 
427     vec_y  = _mm256_setzero_pd();
428     vec_y2 = _mm256_setzero_pd();
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8*i;
433       for (r=0; r<rows_left; ++r) {
434         yval = (MatScalar)0;
435         row = 8*i + r;
436         nnz_in_row = a->rlen[row];
437         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444     #pragma novector
445     #pragma unroll(2)
446     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
447       vec_vals  = _mm256_loadu_pd(aval);
448       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
449       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
450       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
454       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
455       aval     += 4;
456 
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
465       aval     += 4;
466     }
467 
468     _mm256_storeu_pd(y + i*8,     vec_y);
469     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
470   }
471 #else
472   for (i=0; i<totalslices; i++) { /* loop over slices */
473     for (j=0; j<8; j++) sum[j] = 0.0;
474     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
475       sum[0] += aval[j] * x[acolidx[j]];
476       sum[1] += aval[j+1] * x[acolidx[j+1]];
477       sum[2] += aval[j+2] * x[acolidx[j+2]];
478       sum[3] += aval[j+3] * x[acolidx[j+3]];
479       sum[4] += aval[j+4] * x[acolidx[j+4]];
480       sum[5] += aval[j+5] * x[acolidx[j+5]];
481       sum[6] += aval[j+6] * x[acolidx[j+6]];
482       sum[7] += aval[j+7] * x[acolidx[j+7]];
483     }
484     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
485       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
486     } else {
487       for(j=0; j<8; j++) y[8*i+j] = sum[j];
488     }
489   }
490 #endif
491 
492   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
500 {
501   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
502   PetscScalar       *y,*z;
503   const PetscScalar *x;
504   const MatScalar   *aval=a->val;
505   PetscInt          totalslices=a->totalslices;
506   const PetscInt    *acolidx=a->colidx;
507   PetscInt          i,j;
508   PetscErrorCode    ierr;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d           vec_x,vec_y,vec_vals;
511   __m256i           vec_idx;
512   __mmask8          mask;
513   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
514   __m256i           vec_idx2,vec_idx3,vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d           vec_x_tmp;
517   __m256d           vec_x,vec_y,vec_y2,vec_vals;
518   MatScalar         yval;
519   PetscInt          r,row,nnz_in_row;
520 #else
521   PetscScalar       sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x,*y,*aval)
526 #endif
527 
528   PetscFunctionBegin;
529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i=0; i<totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
538       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
539     } else {
540       vec_y  = _mm512_loadu_pd(&y[8*i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
550       acolidx += 8; aval += 8;
551       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
552       acolidx += 8; aval += 8;
553       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
554       acolidx += 8; aval += 8;
555       j += 3;
556       break;
557     case 2:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       j += 2;
563       break;
564     case 1:
565       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
566       acolidx += 8; aval += 8;
567       j += 1;
568       break;
569     }
570     #pragma novector
571     for (; j<(a->sliidx[i+1]>>3); j+=4) {
572       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
573       acolidx += 8; aval += 8;
574       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
575       acolidx += 8; aval += 8;
576       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
579       acolidx += 8; aval += 8;
580     }
581 
582     vec_y = _mm512_add_pd(vec_y,vec_y2);
583     vec_y = _mm512_add_pd(vec_y,vec_y3);
584     vec_y = _mm512_add_pd(vec_y,vec_y4);
585     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
586       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
587     } else {
588       _mm512_storeu_pd(&z[8*i],vec_y);
589     }
590   }
591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
592   for (i=0; i<totalslices; i++) { /* loop over full slices */
593     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
594     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595 
596     /* last slice may have padding rows. Don't use vectorization. */
597     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
598       for (r=0; r<(A->rmap->n & 0x07); ++r) {
599         row        = 8*i + r;
600         yval       = (MatScalar)0.0;
601         nnz_in_row = a->rlen[row];
602         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
603         z[row] = y[row] + yval;
604       }
605       break;
606     }
607 
608     vec_y  = _mm256_loadu_pd(y+8*i);
609     vec_y2 = _mm256_loadu_pd(y+8*i+4);
610 
611     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
612     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
613       vec_vals  = _mm256_loadu_pd(aval);
614       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
615       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
616       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
620       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
621       aval     += 4;
622 
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
625       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
631       aval     += 4;
632     }
633 
634     _mm256_storeu_pd(z+i*8,vec_y);
635     _mm256_storeu_pd(z+i*8+4,vec_y2);
636   }
637 #else
638   for (i=0; i<totalslices; i++) { /* loop over slices */
639     for (j=0; j<8; j++) sum[j] = 0.0;
640     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
641       sum[0] += aval[j] * x[acolidx[j]];
642       sum[1] += aval[j+1] * x[acolidx[j+1]];
643       sum[2] += aval[j+2] * x[acolidx[j+2]];
644       sum[3] += aval[j+3] * x[acolidx[j+3]];
645       sum[4] += aval[j+4] * x[acolidx[j+4]];
646       sum[5] += aval[j+5] * x[acolidx[j+5]];
647       sum[6] += aval[j+6] * x[acolidx[j+6]];
648       sum[7] += aval[j+7] * x[acolidx[j+7]];
649     }
650     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
651       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
652     } else {
653       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
654     }
655   }
656 #endif
657 
658   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
665 {
666   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
667   PetscScalar       *y;
668   const PetscScalar *x;
669   const MatScalar   *aval=a->val;
670   const PetscInt    *acolidx=a->colidx;
671   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
672   PetscErrorCode    ierr;
673 
674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675 #pragma disjoint(*x,*y,*aval)
676 #endif
677 
678   PetscFunctionBegin;
679   if (A->symmetric) {
680     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
681     PetscFunctionReturn(0);
682   }
683   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
686   for (i=0; i<a->totalslices; i++) { /* loop over slices */
687     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
688       for (r=0; r<(A->rmap->n & 0x07); ++r) {
689         row        = 8*i + r;
690         nnz_in_row = a->rlen[row];
691         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
692       }
693       break;
694     }
695     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
696       y[acolidx[j]]   += aval[j] * x[8*i];
697       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
698       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
699       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
700       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
701       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
702       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
703       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
704     }
705   }
706   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   if (A->symmetric) {
718     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
719   } else {
720     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
721     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
722   }
723   PetscFunctionReturn(0);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
730 {
731   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
732   PetscInt       *diag,i;
733   PetscErrorCode ierr;
734 
735   PetscFunctionBegin;
736   *missing = PETSC_FALSE;
737   if (A->rmap->n > 0 && !(a->colidx)) {
738     *missing = PETSC_TRUE;
739     if (d) *d = 0;
740     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
741   } else {
742     diag = a->diag;
743     for (i=0; i<A->rmap->n; i++) {
744       if (diag[i] == -1) {
745         *missing = PETSC_TRUE;
746         if (d) *d = i;
747         ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr);
748         break;
749       }
750     }
751   }
752   PetscFunctionReturn(0);
753 }
754 
755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
756 {
757   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
758   PetscInt       i,j,m=A->rmap->n,shift;
759   PetscErrorCode ierr;
760 
761   PetscFunctionBegin;
762   if (!a->diag) {
763     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
764     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
765     a->free_diag = PETSC_TRUE;
766   }
767   for (i=0; i<m; i++) { /* loop over rows */
768     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
769     a->diag[i] = -1;
770     for (j=0; j<a->rlen[i]; j++) {
771       if (a->colidx[shift+j*8] == i) {
772         a->diag[i] = shift+j*8;
773         break;
774       }
775     }
776   }
777   PetscFunctionReturn(0);
778 }
779 
780 /*
781   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
782 */
783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
784 {
785   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
786   PetscInt       i,*diag,m = A->rmap->n;
787   MatScalar      *val = a->val;
788   PetscScalar    *idiag,*mdiag;
789   PetscErrorCode ierr;
790 
791   PetscFunctionBegin;
792   if (a->idiagvalid) PetscFunctionReturn(0);
793   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
794   diag = a->diag;
795   if (!a->idiag) {
796     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
797     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
798     val  = a->val;
799   }
800   mdiag = a->mdiag;
801   idiag = a->idiag;
802 
803   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804     for (i=0; i<m; i++) {
805       mdiag[i] = val[diag[i]];
806       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807         if (PetscRealPart(fshift)) {
808           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
809           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810           A->factorerror_zeropivot_value = 0.0;
811           A->factorerror_zeropivot_row   = i;
812         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
813       }
814       idiag[i] = 1.0/val[diag[i]];
815     }
816     ierr = PetscLogFlops(m);CHKERRQ(ierr);
817   } else {
818     for (i=0; i<m; i++) {
819       mdiag[i] = val[diag[i]];
820       idiag[i] = omega/(fshift + val[diag[i]]);
821     }
822     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
823   }
824   a->idiagvalid = PETSC_TRUE;
825   PetscFunctionReturn(0);
826 }
827 
828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829 {
830   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
835   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode MatDestroy_SeqSELL(Mat A)
840 {
841   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
842   PetscErrorCode ierr;
843 
844   PetscFunctionBegin;
845 #if defined(PETSC_USE_LOG)
846   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
847 #endif
848   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
849   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
850   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
851   ierr = PetscFree(a->diag);CHKERRQ(ierr);
852   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
853   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
854   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
855   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
856   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
857   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
858   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
859 
860   ierr = PetscFree(A->data);CHKERRQ(ierr);
861 
862   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
863   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
864   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
865 #if defined(PETSC_HAVE_ELEMENTAL)
866 #endif
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
872 {
873   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
874   PetscErrorCode ierr;
875 
876   PetscFunctionBegin;
877   switch (op) {
878   case MAT_ROW_ORIENTED:
879     a->roworiented = flg;
880     break;
881   case MAT_KEEP_NONZERO_PATTERN:
882     a->keepnonzeropattern = flg;
883     break;
884   case MAT_NEW_NONZERO_LOCATIONS:
885     a->nonew = (flg ? 0 : 1);
886     break;
887   case MAT_NEW_NONZERO_LOCATION_ERR:
888     a->nonew = (flg ? -1 : 0);
889     break;
890   case MAT_NEW_NONZERO_ALLOCATION_ERR:
891     a->nonew = (flg ? -2 : 0);
892     break;
893   case MAT_UNUSED_NONZERO_LOCATION_ERR:
894     a->nounused = (flg ? -1 : 0);
895     break;
896   case MAT_NEW_DIAGONALS:
897   case MAT_IGNORE_OFF_PROC_ENTRIES:
898   case MAT_USE_HASH_TABLE:
899   case MAT_SORTED_FULL:
900     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
901     break;
902   case MAT_SPD:
903   case MAT_SYMMETRIC:
904   case MAT_STRUCTURALLY_SYMMETRIC:
905   case MAT_HERMITIAN:
906   case MAT_SYMMETRY_ETERNAL:
907     /* These options are handled directly by MatSetOption() */
908     break;
909   default:
910     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
916 {
917   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
918   PetscInt       i,j,n,shift;
919   PetscScalar    *x,zero=0.0;
920   PetscErrorCode ierr;
921 
922   PetscFunctionBegin;
923   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
924   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
925 
926   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
927     PetscInt *diag=a->diag;
928     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
929     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
930     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
931     PetscFunctionReturn(0);
932   }
933 
934   ierr = VecSet(v,zero);CHKERRQ(ierr);
935   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
936   for (i=0; i<n; i++) { /* loop over rows */
937     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
938     x[i] = 0;
939     for (j=0; j<a->rlen[i]; j++) {
940       if (a->colidx[shift+j*8] == i) {
941         x[i] = a->val[shift+j*8];
942         break;
943       }
944     }
945   }
946   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
947   PetscFunctionReturn(0);
948 }
949 
950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
951 {
952   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
953   const PetscScalar *l,*r;
954   PetscInt          i,j,m,n,row;
955   PetscErrorCode    ierr;
956 
957   PetscFunctionBegin;
958   if (ll) {
959     /* The local size is used so that VecMPI can be passed to this routine
960        by MatDiagonalScale_MPISELL */
961     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
962     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
963     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
964     for (i=0; i<a->totalslices; i++) { /* loop over slices */
965       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
966         a->val[j] *= l[8*i+row];
967       }
968     }
969     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
970     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
971   }
972   if (rr) {
973     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
974     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
975     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
976     for (i=0; i<a->totalslices; i++) { /* loop over slices */
977       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
978         a->val[j] *= r[a->colidx[j]];
979       }
980     }
981     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
982     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
983   }
984   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
985   PetscFunctionReturn(0);
986 }
987 
988 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
989 
990 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
991 {
992   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
993   PetscInt    *cp,i,k,low,high,t,row,col,l;
994   PetscInt    shift;
995   MatScalar   *vp;
996 
997   PetscFunctionBegin;
998   for (k=0; k<m; k++) { /* loop over requested rows */
999     row = im[k];
1000     if (row<0) continue;
1001 #if defined(PETSC_USE_DEBUG)
1002     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1003 #endif
1004     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1005     cp = a->colidx+shift; /* pointer to the row */
1006     vp = a->val+shift; /* pointer to the row */
1007     for (l=0; l<n; l++) { /* loop over requested columns */
1008       col = in[l];
1009       if (col<0) continue;
1010 #if defined(PETSC_USE_DEBUG)
1011       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1012 #endif
1013       high = a->rlen[row]; low = 0; /* assume unsorted */
1014       while (high-low > 5) {
1015         t = (low+high)/2;
1016         if (*(cp+t*8) > col) high = t;
1017         else low = t;
1018       }
1019       for (i=low; i<high; i++) {
1020         if (*(cp+8*i) > col) break;
1021         if (*(cp+8*i) == col) {
1022           *v++ = *(vp+8*i);
1023           goto finished;
1024         }
1025       }
1026       *v++ = 0.0;
1027     finished:;
1028     }
1029   }
1030   PetscFunctionReturn(0);
1031 }
1032 
1033 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1034 {
1035   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1036   PetscInt          i,j,m=A->rmap->n,shift;
1037   const char        *name;
1038   PetscViewerFormat format;
1039   PetscErrorCode    ierr;
1040 
1041   PetscFunctionBegin;
1042   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1043   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1044     PetscInt nofinalvalue = 0;
1045     /*
1046     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1047       nofinalvalue = 1;
1048     }
1049     */
1050     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1051     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1052     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1053 #if defined(PETSC_USE_COMPLEX)
1054     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1055 #else
1056     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1057 #endif
1058     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1059 
1060     for (i=0; i<m; i++) {
1061       shift = a->sliidx[i>>3]+(i&0x07);
1062       for (j=0; j<a->rlen[i]; j++) {
1063 #if defined(PETSC_USE_COMPLEX)
1064         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1065 #else
1066         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1067 #endif
1068       }
1069     }
1070     /*
1071     if (nofinalvalue) {
1072 #if defined(PETSC_USE_COMPLEX)
1073       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1074 #else
1075       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1076 #endif
1077     }
1078     */
1079     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1080     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1081     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1082   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1083     PetscFunctionReturn(0);
1084   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1085     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1086     for (i=0; i<m; i++) {
1087       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1088       shift = a->sliidx[i>>3]+(i&0x07);
1089       for (j=0; j<a->rlen[i]; j++) {
1090 #if defined(PETSC_USE_COMPLEX)
1091         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1092           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1093         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1094           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1095         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1096           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1097         }
1098 #else
1099         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1100 #endif
1101       }
1102       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1103     }
1104     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1105   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1106     PetscInt    cnt=0,jcnt;
1107     PetscScalar value;
1108 #if defined(PETSC_USE_COMPLEX)
1109     PetscBool   realonly=PETSC_TRUE;
1110     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1111       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1112         realonly = PETSC_FALSE;
1113         break;
1114       }
1115     }
1116 #endif
1117 
1118     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1119     for (i=0; i<m; i++) {
1120       jcnt = 0;
1121       shift = a->sliidx[i>>3]+(i&0x07);
1122       for (j=0; j<A->cmap->n; j++) {
1123         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1124           value = a->val[cnt++];
1125           jcnt++;
1126         } else {
1127           value = 0.0;
1128         }
1129 #if defined(PETSC_USE_COMPLEX)
1130         if (realonly) {
1131           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1132         } else {
1133           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1134         }
1135 #else
1136         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1137 #endif
1138       }
1139       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1140     }
1141     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1142   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1143     PetscInt fshift=1;
1144     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1145 #if defined(PETSC_USE_COMPLEX)
1146     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1147 #else
1148     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1149 #endif
1150     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1151     for (i=0; i<m; i++) {
1152       shift = a->sliidx[i>>3]+(i&0x07);
1153       for (j=0; j<a->rlen[i]; j++) {
1154 #if defined(PETSC_USE_COMPLEX)
1155         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1156 #else
1157         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1158 #endif
1159       }
1160     }
1161     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1162   } else if (format == PETSC_VIEWER_NATIVE) {
1163     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1164       PetscInt row;
1165       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1166       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1167 #if defined(PETSC_USE_COMPLEX)
1168         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1169           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1170         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1171           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1172         } else {
1173           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1174         }
1175 #else
1176         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1177 #endif
1178       }
1179     }
1180   } else {
1181     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1182     if (A->factortype) {
1183       for (i=0; i<m; i++) {
1184         shift = a->sliidx[i>>3]+(i&0x07);
1185         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1186         /* L part */
1187         for (j=shift; j<a->diag[i]; j+=8) {
1188 #if defined(PETSC_USE_COMPLEX)
1189           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1190             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1191           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1192             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1193           } else {
1194             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1195           }
1196 #else
1197           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1198 #endif
1199         }
1200         /* diagonal */
1201         j = a->diag[i];
1202 #if defined(PETSC_USE_COMPLEX)
1203         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1204           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1205         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1206           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1207         } else {
1208           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1209         }
1210 #else
1211         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1212 #endif
1213 
1214         /* U part */
1215         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1216 #if defined(PETSC_USE_COMPLEX)
1217           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1219           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1221           } else {
1222             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1223           }
1224 #else
1225           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1226 #endif
1227         }
1228         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1229       }
1230     } else {
1231       for (i=0; i<m; i++) {
1232         shift = a->sliidx[i>>3]+(i&0x07);
1233         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1234         for (j=0; j<a->rlen[i]; j++) {
1235 #if defined(PETSC_USE_COMPLEX)
1236           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1237             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1238           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1239             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1240           } else {
1241             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1242           }
1243 #else
1244           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1245 #endif
1246         }
1247         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1248       }
1249     }
1250     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1251   }
1252   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1253   PetscFunctionReturn(0);
1254 }
1255 
1256 #include <petscdraw.h>
1257 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1258 {
1259   Mat               A=(Mat)Aa;
1260   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1261   PetscInt          i,j,m=A->rmap->n,shift;
1262   int               color;
1263   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1264   PetscViewer       viewer;
1265   PetscViewerFormat format;
1266   PetscErrorCode    ierr;
1267 
1268   PetscFunctionBegin;
1269   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1270   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1271   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1272 
1273   /* loop over matrix elements drawing boxes */
1274 
1275   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1276     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1277     /* Blue for negative, Cyan for zero and  Red for positive */
1278     color = PETSC_DRAW_BLUE;
1279     for (i=0; i<m; i++) {
1280       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1281       y_l = m - i - 1.0; y_r = y_l + 1.0;
1282       for (j=0; j<a->rlen[i]; j++) {
1283         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1284         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1285         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1286       }
1287     }
1288     color = PETSC_DRAW_CYAN;
1289     for (i=0; i<m; i++) {
1290       shift = a->sliidx[i>>3]+(i&0x07);
1291       y_l = m - i - 1.0; y_r = y_l + 1.0;
1292       for (j=0; j<a->rlen[i]; j++) {
1293         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1294         if (a->val[shift+8*j] !=  0.) continue;
1295         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1296       }
1297     }
1298     color = PETSC_DRAW_RED;
1299     for (i=0; i<m; i++) {
1300       shift = a->sliidx[i>>3]+(i&0x07);
1301       y_l = m - i - 1.0; y_r = y_l + 1.0;
1302       for (j=0; j<a->rlen[i]; j++) {
1303         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1304         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1305         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1306       }
1307     }
1308     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1309   } else {
1310     /* use contour shading to indicate magnitude of values */
1311     /* first determine max of all nonzero values */
1312     PetscReal minv=0.0,maxv=0.0;
1313     PetscInt  count=0;
1314     PetscDraw popup;
1315     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1316       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1317     }
1318     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1319     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1320     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1321 
1322     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1323     for (i=0; i<m; i++) {
1324       shift = a->sliidx[i>>3]+(i&0x07);
1325       y_l = m - i - 1.0;
1326       y_r = y_l + 1.0;
1327       for (j=0; j<a->rlen[i]; j++) {
1328         x_l = a->colidx[shift+j*8];
1329         x_r = x_l + 1.0;
1330         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1331         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1332         count++;
1333       }
1334     }
1335     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1336   }
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #include <petscdraw.h>
1341 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1342 {
1343   PetscDraw      draw;
1344   PetscReal      xr,yr,xl,yl,h,w;
1345   PetscBool      isnull;
1346   PetscErrorCode ierr;
1347 
1348   PetscFunctionBegin;
1349   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1350   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1351   if (isnull) PetscFunctionReturn(0);
1352 
1353   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1354   xr  += w;          yr += h;         xl = -w;     yl = -h;
1355   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1356   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1357   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1358   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1359   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1364 {
1365   PetscBool      iascii,isbinary,isdraw;
1366   PetscErrorCode ierr;
1367 
1368   PetscFunctionBegin;
1369   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1370   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1371   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1372   if (iascii) {
1373     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1374   } else if (isbinary) {
1375     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1376   } else if (isdraw) {
1377     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1378   }
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1383 {
1384   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1385   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1386   MatScalar      *vp;
1387   PetscErrorCode ierr;
1388 
1389   PetscFunctionBegin;
1390   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1391   /* To do: compress out the unused elements */
1392   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1393   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1394   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1395   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1396   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1397   for (i=0; i<a->totalslices; ++i) {
1398     shift = a->sliidx[i];    /* starting index of the slice */
1399     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1400     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1401     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1402       row  = 8*i + row_in_slice;
1403       nrow = a->rlen[row]; /* number of nonzeros in row */
1404       /*
1405         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1406         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1407       */
1408       lastcol = 0;
1409       if (nrow>0) { /* nonempty row */
1410         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1411       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1412         for (j=1;j<8;j++) {
1413           if (a->rlen[8*i+j]) {
1414             lastcol = cp[j];
1415             break;
1416           }
1417         }
1418       } else {
1419         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1420       }
1421 
1422       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1423         cp[8*k+row_in_slice] = lastcol;
1424         vp[8*k+row_in_slice] = (MatScalar)0;
1425       }
1426     }
1427   }
1428 
1429   A->info.mallocs += a->reallocs;
1430   a->reallocs      = 0;
1431 
1432   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1433   PetscFunctionReturn(0);
1434 }
1435 
1436 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1437 {
1438   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1439 
1440   PetscFunctionBegin;
1441   info->block_size   = 1.0;
1442   info->nz_allocated = (double)a->maxallocmat;
1443   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1444   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1445   info->assemblies   = (double)A->num_ass;
1446   info->mallocs      = (double)A->info.mallocs;
1447   info->memory       = ((PetscObject)A)->mem;
1448   if (A->factortype) {
1449     info->fill_ratio_given  = A->info.fill_ratio_given;
1450     info->fill_ratio_needed = A->info.fill_ratio_needed;
1451     info->factor_mallocs    = A->info.factor_mallocs;
1452   } else {
1453     info->fill_ratio_given  = 0;
1454     info->fill_ratio_needed = 0;
1455     info->factor_mallocs    = 0;
1456   }
1457   PetscFunctionReturn(0);
1458 }
1459 
1460 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1461 {
1462   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1463   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1464   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1465   MatScalar      *vp,value;
1466   PetscErrorCode ierr;
1467 
1468   PetscFunctionBegin;
1469   for (k=0; k<m; k++) { /* loop over added rows */
1470     row = im[k];
1471     if (row < 0) continue;
1472 #if defined(PETSC_USE_DEBUG)
1473     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1474 #endif
1475     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1476     cp    = a->colidx+shift; /* pointer to the row */
1477     vp    = a->val+shift; /* pointer to the row */
1478     nrow  = a->rlen[row];
1479     low   = 0;
1480     high  = nrow;
1481 
1482     for (l=0; l<n; l++) { /* loop over added columns */
1483       col = in[l];
1484       if (col<0) continue;
1485 #if defined(PETSC_USE_DEBUG)
1486       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1487 #endif
1488       if (a->roworiented) {
1489         value = v[l+k*n];
1490       } else {
1491         value = v[k+l*m];
1492       }
1493       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1494 
1495       /* search in this row for the specified colmun, i indicates the column to be set */
1496       if (col <= lastcol) low = 0;
1497       else high = nrow;
1498       lastcol = col;
1499       while (high-low > 5) {
1500         t = (low+high)/2;
1501         if (*(cp+t*8) > col) high = t;
1502         else low = t;
1503       }
1504       for (i=low; i<high; i++) {
1505         if (*(cp+i*8) > col) break;
1506         if (*(cp+i*8) == col) {
1507           if (is == ADD_VALUES) *(vp+i*8) += value;
1508           else *(vp+i*8) = value;
1509           low = i + 1;
1510           goto noinsert;
1511         }
1512       }
1513       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1514       if (nonew == 1) goto noinsert;
1515       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1516       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1517       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1518       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1519       for (ii=nrow-1; ii>=i; ii--) {
1520         *(cp+(ii+1)*8) = *(cp+ii*8);
1521         *(vp+(ii+1)*8) = *(vp+ii*8);
1522       }
1523       a->rlen[row]++;
1524       *(cp+i*8) = col;
1525       *(vp+i*8) = value;
1526       a->nz++;
1527       A->nonzerostate++;
1528       low = i+1; high++; nrow++;
1529 noinsert:;
1530     }
1531     a->rlen[row] = nrow;
1532   }
1533   PetscFunctionReturn(0);
1534 }
1535 
1536 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1537 {
1538   PetscErrorCode ierr;
1539 
1540   PetscFunctionBegin;
1541   /* If the two matrices have the same copy implementation, use fast copy. */
1542   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1543     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1544     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1545 
1546     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1547     ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1548   } else {
1549     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1550   }
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1555 {
1556   PetscErrorCode ierr;
1557 
1558   PetscFunctionBegin;
1559   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1564 {
1565   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1566 
1567   PetscFunctionBegin;
1568   *array = a->val;
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1573 {
1574   PetscFunctionBegin;
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1579 {
1580   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1581   PetscInt    i;
1582   MatScalar   *aval=a->val;
1583 
1584   PetscFunctionBegin;
1585   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1590 {
1591   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1592   PetscInt       i;
1593   MatScalar      *aval=a->val;
1594   PetscErrorCode ierr;
1595 
1596   PetscFunctionBegin;
1597   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1598   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1603 {
1604   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1605   MatScalar      *aval=a->val;
1606   PetscScalar    oalpha=alpha;
1607   PetscBLASInt   one=1,size;
1608   PetscErrorCode ierr;
1609 
1610   PetscFunctionBegin;
1611   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1612   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1613   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1614   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1619 {
1620   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1621   PetscErrorCode ierr;
1622 
1623   PetscFunctionBegin;
1624   if (!Y->preallocated || !y->nz) {
1625     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1626   }
1627   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1632 {
1633   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1634   PetscScalar       *x,sum,*t;
1635   const MatScalar   *idiag=0,*mdiag;
1636   const PetscScalar *b,*xb;
1637   PetscInt          n,m=A->rmap->n,i,j,shift;
1638   const PetscInt    *diag;
1639   PetscErrorCode    ierr;
1640 
1641   PetscFunctionBegin;
1642   its = its*lits;
1643 
1644   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1645   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1646   a->fshift = fshift;
1647   a->omega  = omega;
1648 
1649   diag  = a->diag;
1650   t     = a->ssor_work;
1651   idiag = a->idiag;
1652   mdiag = a->mdiag;
1653 
1654   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1656   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1657   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1658   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1659   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1660 
1661   if (flag & SOR_ZERO_INITIAL_GUESS) {
1662     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1663       for (i=0; i<m; i++) {
1664         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1665         sum   = b[i];
1666         n     = (diag[i]-shift)/8;
1667         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1668         t[i]  = sum;
1669         x[i]  = sum*idiag[i];
1670       }
1671       xb   = t;
1672       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1673     } else xb = b;
1674     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1675       for (i=m-1; i>=0; i--) {
1676         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1677         sum   = xb[i];
1678         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1679         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1680         if (xb == b) {
1681           x[i] = sum*idiag[i];
1682         } else {
1683           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1684         }
1685       }
1686       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1687     }
1688     its--;
1689   }
1690   while (its--) {
1691     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1692       for (i=0; i<m; i++) {
1693         /* lower */
1694         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1695         sum   = b[i];
1696         n     = (diag[i]-shift)/8;
1697         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1698         t[i]  = sum;             /* save application of the lower-triangular part */
1699         /* upper */
1700         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1701         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1702         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1703       }
1704       xb   = t;
1705       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1706     } else xb = b;
1707     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1708       for (i=m-1; i>=0; i--) {
1709         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1710         sum = xb[i];
1711         if (xb == b) {
1712           /* whole matrix (no checkpointing available) */
1713           n     = a->rlen[i];
1714           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1715           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1716         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1717           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1718           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1719           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1720         }
1721       }
1722       if (xb == b) {
1723         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1724       } else {
1725         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1726       }
1727     }
1728   }
1729   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1730   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 /* -------------------------------------------------------------------*/
1735 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1736                                        MatGetRow_SeqSELL,
1737                                        MatRestoreRow_SeqSELL,
1738                                        MatMult_SeqSELL,
1739                                /* 4*/  MatMultAdd_SeqSELL,
1740                                        MatMultTranspose_SeqSELL,
1741                                        MatMultTransposeAdd_SeqSELL,
1742                                        0,
1743                                        0,
1744                                        0,
1745                                /* 10*/ 0,
1746                                        0,
1747                                        0,
1748                                        MatSOR_SeqSELL,
1749                                        0,
1750                                /* 15*/ MatGetInfo_SeqSELL,
1751                                        MatEqual_SeqSELL,
1752                                        MatGetDiagonal_SeqSELL,
1753                                        MatDiagonalScale_SeqSELL,
1754                                        0,
1755                                /* 20*/ 0,
1756                                        MatAssemblyEnd_SeqSELL,
1757                                        MatSetOption_SeqSELL,
1758                                        MatZeroEntries_SeqSELL,
1759                                /* 24*/ 0,
1760                                        0,
1761                                        0,
1762                                        0,
1763                                        0,
1764                                /* 29*/ MatSetUp_SeqSELL,
1765                                        0,
1766                                        0,
1767                                        0,
1768                                        0,
1769                                /* 34*/ MatDuplicate_SeqSELL,
1770                                        0,
1771                                        0,
1772                                        0,
1773                                        0,
1774                                /* 39*/ 0,
1775                                        0,
1776                                        0,
1777                                        MatGetValues_SeqSELL,
1778                                        MatCopy_SeqSELL,
1779                                /* 44*/ 0,
1780                                        MatScale_SeqSELL,
1781                                        MatShift_SeqSELL,
1782                                        0,
1783                                        0,
1784                                /* 49*/ 0,
1785                                        0,
1786                                        0,
1787                                        0,
1788                                        0,
1789                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1790                                        0,
1791                                        0,
1792                                        0,
1793                                        0,
1794                                /* 59*/ 0,
1795                                        MatDestroy_SeqSELL,
1796                                        MatView_SeqSELL,
1797                                        0,
1798                                        0,
1799                                /* 64*/ 0,
1800                                        0,
1801                                        0,
1802                                        0,
1803                                        0,
1804                                /* 69*/ 0,
1805                                        0,
1806                                        0,
1807                                        0,
1808                                        0,
1809                                /* 74*/ 0,
1810                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1811                                        0,
1812                                        0,
1813                                        0,
1814                                /* 79*/ 0,
1815                                        0,
1816                                        0,
1817                                        0,
1818                                        0,
1819                                /* 84*/ 0,
1820                                        0,
1821                                        0,
1822                                        0,
1823                                        0,
1824                                /* 89*/ 0,
1825                                        0,
1826                                        0,
1827                                        0,
1828                                        0,
1829                                /* 94*/ 0,
1830                                        0,
1831                                        0,
1832                                        0,
1833                                        0,
1834                                /* 99*/ 0,
1835                                        0,
1836                                        0,
1837                                        MatConjugate_SeqSELL,
1838                                        0,
1839                                /*104*/ 0,
1840                                        0,
1841                                        0,
1842                                        0,
1843                                        0,
1844                                /*109*/ 0,
1845                                        0,
1846                                        0,
1847                                        0,
1848                                        MatMissingDiagonal_SeqSELL,
1849                                /*114*/ 0,
1850                                        0,
1851                                        0,
1852                                        0,
1853                                        0,
1854                                /*119*/ 0,
1855                                        0,
1856                                        0,
1857                                        0,
1858                                        0,
1859                                /*124*/ 0,
1860                                        0,
1861                                        0,
1862                                        0,
1863                                        0,
1864                                /*129*/ 0,
1865                                        0,
1866                                        0,
1867                                        0,
1868                                        0,
1869                                /*134*/ 0,
1870                                        0,
1871                                        0,
1872                                        0,
1873                                        0,
1874                                /*139*/ 0,
1875                                        0,
1876                                        0,
1877                                        MatFDColoringSetUp_SeqXAIJ,
1878                                        0,
1879                                 /*144*/0
1880 };
1881 
1882 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1883 {
1884   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1885   PetscErrorCode ierr;
1886 
1887   PetscFunctionBegin;
1888   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1889 
1890   /* allocate space for values if not already there */
1891   if (!a->saved_values) {
1892     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1893     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1894   }
1895 
1896   /* copy values over */
1897   ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1902 {
1903   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1904   PetscErrorCode ierr;
1905 
1906   PetscFunctionBegin;
1907   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1908   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1909   ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1910   PetscFunctionReturn(0);
1911 }
1912 
1913 /*@C
1914  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1915 
1916  Not Collective
1917 
1918  Input Parameters:
1919  .  mat - a MATSEQSELL matrix
1920  .  array - pointer to the data
1921 
1922  Level: intermediate
1923 
1924  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1925  @*/
1926 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1927 {
1928   PetscErrorCode ierr;
1929 
1930   PetscFunctionBegin;
1931   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1932   PetscFunctionReturn(0);
1933 }
1934 
1935 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1936 {
1937   Mat_SeqSELL    *b;
1938   PetscMPIInt    size;
1939   PetscErrorCode ierr;
1940 
1941   PetscFunctionBegin;
1942   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1943   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1944 
1945   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1946 
1947   B->data = (void*)b;
1948 
1949   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1950 
1951   b->row                = 0;
1952   b->col                = 0;
1953   b->icol               = 0;
1954   b->reallocs           = 0;
1955   b->ignorezeroentries  = PETSC_FALSE;
1956   b->roworiented        = PETSC_TRUE;
1957   b->nonew              = 0;
1958   b->diag               = 0;
1959   b->solve_work         = 0;
1960   B->spptr              = 0;
1961   b->saved_values       = 0;
1962   b->idiag              = 0;
1963   b->mdiag              = 0;
1964   b->ssor_work          = 0;
1965   b->omega              = 1.0;
1966   b->fshift             = 0.0;
1967   b->idiagvalid         = PETSC_FALSE;
1968   b->keepnonzeropattern = PETSC_FALSE;
1969 
1970   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1971   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1972   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1973   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1974   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1977   PetscFunctionReturn(0);
1978 }
1979 
1980 /*
1981  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1982  */
1983 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1984 {
1985   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1986   PetscInt       i,m=A->rmap->n;
1987   PetscInt       totalslices=a->totalslices;
1988   PetscErrorCode ierr;
1989 
1990   PetscFunctionBegin;
1991   c = (Mat_SeqSELL*)C->data;
1992 
1993   C->factortype = A->factortype;
1994   c->row        = 0;
1995   c->col        = 0;
1996   c->icol       = 0;
1997   c->reallocs   = 0;
1998 
1999   C->assembled = PETSC_TRUE;
2000 
2001   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2002   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2003 
2004   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2005   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2006   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2007   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2008 
2009   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2010   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2011 
2012   /* allocate the matrix space */
2013   if (mallocmatspace) {
2014     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2015     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2016 
2017     c->singlemalloc = PETSC_TRUE;
2018 
2019     if (m > 0) {
2020       ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr);
2021       if (cpvalues == MAT_COPY_VALUES) {
2022         ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr);
2023       } else {
2024         ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr);
2025       }
2026     }
2027   }
2028 
2029   c->ignorezeroentries = a->ignorezeroentries;
2030   c->roworiented       = a->roworiented;
2031   c->nonew             = a->nonew;
2032   if (a->diag) {
2033     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2034     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2035     for (i=0; i<m; i++) {
2036       c->diag[i] = a->diag[i];
2037     }
2038   } else c->diag = 0;
2039 
2040   c->solve_work         = 0;
2041   c->saved_values       = 0;
2042   c->idiag              = 0;
2043   c->ssor_work          = 0;
2044   c->keepnonzeropattern = a->keepnonzeropattern;
2045   c->free_val           = PETSC_TRUE;
2046   c->free_colidx        = PETSC_TRUE;
2047 
2048   c->maxallocmat  = a->maxallocmat;
2049   c->maxallocrow  = a->maxallocrow;
2050   c->rlenmax      = a->rlenmax;
2051   c->nz           = a->nz;
2052   C->preallocated = PETSC_TRUE;
2053 
2054   c->nonzerorowcnt = a->nonzerorowcnt;
2055   C->nonzerostate  = A->nonzerostate;
2056 
2057   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2058   PetscFunctionReturn(0);
2059 }
2060 
2061 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2062 {
2063   PetscErrorCode ierr;
2064 
2065   PetscFunctionBegin;
2066   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2067   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2068   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2069     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2070   }
2071   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2072   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2073   PetscFunctionReturn(0);
2074 }
2075 
2076 /*@C
2077  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2078 
2079  Collective
2080 
2081  Input Parameters:
2082  +  comm - MPI communicator, set to PETSC_COMM_SELF
2083  .  m - number of rows
2084  .  n - number of columns
2085  .  rlenmax - maximum number of nonzeros in a row
2086  -  rlen - array containing the number of nonzeros in the various rows
2087  (possibly different for each row) or NULL
2088 
2089  Output Parameter:
2090  .  A - the matrix
2091 
2092  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2093  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2094  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2095 
2096  Notes:
2097  If nnz is given then nz is ignored
2098 
2099  Specify the preallocated storage with either rlenmax or rlen (not both).
2100  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2101  allocation.  For large problems you MUST preallocate memory or you
2102  will get TERRIBLE performance, see the users' manual chapter on matrices.
2103 
2104  Level: intermediate
2105 
2106  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2107 
2108  @*/
2109 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2110 {
2111   PetscErrorCode ierr;
2112 
2113   PetscFunctionBegin;
2114   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2115   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2116   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2117   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2118   PetscFunctionReturn(0);
2119 }
2120 
2121 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2122 {
2123   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2124   PetscInt       totalslices=a->totalslices;
2125   PetscErrorCode ierr;
2126 
2127   PetscFunctionBegin;
2128   /* If the  matrix dimensions are not equal,or no of nonzeros */
2129   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2130     *flg = PETSC_FALSE;
2131     PetscFunctionReturn(0);
2132   }
2133   /* if the a->colidx are the same */
2134   ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2135   if (!*flg) PetscFunctionReturn(0);
2136   /* if a->val are the same */
2137   ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2138   PetscFunctionReturn(0);
2139 }
2140 
2141 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2142 {
2143   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2144 
2145   PetscFunctionBegin;
2146   a->idiagvalid  = PETSC_FALSE;
2147   PetscFunctionReturn(0);
2148 }
2149 
2150 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2151 {
2152 #if defined(PETSC_USE_COMPLEX)
2153   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2154   PetscInt    i;
2155   PetscScalar *val = a->val;
2156 
2157   PetscFunctionBegin;
2158   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2159     val[i] = PetscConj(val[i]);
2160   }
2161 #else
2162   PetscFunctionBegin;
2163 #endif
2164   PetscFunctionReturn(0);
2165 }
2166