xref: /petsc/src/mat/impls/sell/seq/sell.c (revision d2fd7bfc6f0fd2e1d083decbb7cc7d77e16824f0)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accurate as possible.
67  If it is underestimated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);}
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         ierr = PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]);CHKERRQ(ierr);
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261 
262   if (reuse == MAT_REUSE_MATRIX) {
263     B = *newmat;
264   } else {
265     /* Can we just use ilen? */
266     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
267     for (i=0; i<m; i++) {
268       rowlengths[i] = ai[i+1] - ai[i];
269     }
270 
271     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
272     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
273     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
274     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
275     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
276   }
277 
278   for (row=0; row<m; row++) {
279     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
280     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
281     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
282   }
283   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval=a->val;
301   PetscInt          totalslices=a->totalslices;
302   const PetscInt    *acolidx=a->colidx;
303   PetscInt          i,j;
304   PetscErrorCode    ierr;
305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m512d           vec_x,vec_y,vec_vals;
307   __m256i           vec_idx;
308   __mmask8          mask;
309   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
310   __m256i           vec_idx2,vec_idx3,vec_idx4;
311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
312   __m128i           vec_idx;
313   __m256d           vec_x,vec_y,vec_y2,vec_vals;
314   MatScalar         yval;
315   PetscInt          r,rows_left,row,nnz_in_row;
316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
317   __m128d           vec_x_tmp;
318   __m256d           vec_x,vec_y,vec_y2,vec_vals;
319   MatScalar         yval;
320   PetscInt          r,rows_left,row,nnz_in_row;
321 #else
322   PetscScalar       sum[8];
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326 #pragma disjoint(*x,*y,*aval)
327 #endif
328 
329   PetscFunctionBegin;
330   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
331   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   for (i=0; i<totalslices; i++) { /* loop over slices */
334     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
335     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
336 
337     vec_y  = _mm512_setzero_pd();
338     vec_y2 = _mm512_setzero_pd();
339     vec_y3 = _mm512_setzero_pd();
340     vec_y4 = _mm512_setzero_pd();
341 
342     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
343     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
344     case 3:
345       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
346       acolidx += 8; aval += 8;
347       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
348       acolidx += 8; aval += 8;
349       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
350       acolidx += 8; aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
355       acolidx += 8; aval += 8;
356       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
357       acolidx += 8; aval += 8;
358       j += 2;
359       break;
360     case 1:
361       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
362       acolidx += 8; aval += 8;
363       j += 1;
364       break;
365     }
366     #pragma novector
367     for (; j<(a->sliidx[i+1]>>3); j+=4) {
368       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
369       acolidx += 8; aval += 8;
370       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
371       acolidx += 8; aval += 8;
372       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
375       acolidx += 8; aval += 8;
376     }
377 
378     vec_y = _mm512_add_pd(vec_y,vec_y2);
379     vec_y = _mm512_add_pd(vec_y,vec_y3);
380     vec_y = _mm512_add_pd(vec_y,vec_y4);
381     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
383       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
384     } else {
385       _mm512_storeu_pd(&y[8*i],vec_y);
386     }
387   }
388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389   for (i=0; i<totalslices; i++) { /* loop over full slices */
390     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
391     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
392 
393     /* last slice may have padding rows. Don't use vectorization. */
394     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
395       rows_left = A->rmap->n - 8*i;
396       for (r=0; r<rows_left; ++r) {
397         yval = (MatScalar)0;
398         row = 8*i + r;
399         nnz_in_row = a->rlen[row];
400         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
401         y[row] = yval;
402       }
403       break;
404     }
405 
406     vec_y  = _mm256_setzero_pd();
407     vec_y2 = _mm256_setzero_pd();
408 
409     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410     #pragma novector
411     #pragma unroll(2)
412     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
413       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
414       aval += 4; acolidx += 4;
415       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
416       aval += 4; acolidx += 4;
417     }
418 
419     _mm256_storeu_pd(y+i*8,vec_y);
420     _mm256_storeu_pd(y+i*8+4,vec_y2);
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   for (i=0; i<totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
426 
427     vec_y  = _mm256_setzero_pd();
428     vec_y2 = _mm256_setzero_pd();
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8*i;
433       for (r=0; r<rows_left; ++r) {
434         yval = (MatScalar)0;
435         row = 8*i + r;
436         nnz_in_row = a->rlen[row];
437         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444     #pragma novector
445     #pragma unroll(2)
446     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
447       vec_vals  = _mm256_loadu_pd(aval);
448       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
449       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
450       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
454       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
455       aval     += 4;
456 
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
465       aval     += 4;
466     }
467 
468     _mm256_storeu_pd(y + i*8,     vec_y);
469     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
470   }
471 #else
472   for (i=0; i<totalslices; i++) { /* loop over slices */
473     for (j=0; j<8; j++) sum[j] = 0.0;
474     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
475       sum[0] += aval[j] * x[acolidx[j]];
476       sum[1] += aval[j+1] * x[acolidx[j+1]];
477       sum[2] += aval[j+2] * x[acolidx[j+2]];
478       sum[3] += aval[j+3] * x[acolidx[j+3]];
479       sum[4] += aval[j+4] * x[acolidx[j+4]];
480       sum[5] += aval[j+5] * x[acolidx[j+5]];
481       sum[6] += aval[j+6] * x[acolidx[j+6]];
482       sum[7] += aval[j+7] * x[acolidx[j+7]];
483     }
484     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
485       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
486     } else {
487       for(j=0; j<8; j++) y[8*i+j] = sum[j];
488     }
489   }
490 #endif
491 
492   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
500 {
501   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
502   PetscScalar       *y,*z;
503   const PetscScalar *x;
504   const MatScalar   *aval=a->val;
505   PetscInt          totalslices=a->totalslices;
506   const PetscInt    *acolidx=a->colidx;
507   PetscInt          i,j;
508   PetscErrorCode    ierr;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d           vec_x,vec_y,vec_vals;
511   __m256i           vec_idx;
512   __mmask8          mask;
513   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
514   __m256i           vec_idx2,vec_idx3,vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d           vec_x_tmp;
517   __m256d           vec_x,vec_y,vec_y2,vec_vals;
518   MatScalar         yval;
519   PetscInt          r,row,nnz_in_row;
520 #else
521   PetscScalar       sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x,*y,*aval)
526 #endif
527 
528   PetscFunctionBegin;
529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i=0; i<totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
538       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
539     } else {
540       vec_y  = _mm512_loadu_pd(&y[8*i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
550       acolidx += 8; aval += 8;
551       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
552       acolidx += 8; aval += 8;
553       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
554       acolidx += 8; aval += 8;
555       j += 3;
556       break;
557     case 2:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       j += 2;
563       break;
564     case 1:
565       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
566       acolidx += 8; aval += 8;
567       j += 1;
568       break;
569     }
570     #pragma novector
571     for (; j<(a->sliidx[i+1]>>3); j+=4) {
572       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
573       acolidx += 8; aval += 8;
574       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
575       acolidx += 8; aval += 8;
576       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
579       acolidx += 8; aval += 8;
580     }
581 
582     vec_y = _mm512_add_pd(vec_y,vec_y2);
583     vec_y = _mm512_add_pd(vec_y,vec_y3);
584     vec_y = _mm512_add_pd(vec_y,vec_y4);
585     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
586       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
587     } else {
588       _mm512_storeu_pd(&z[8*i],vec_y);
589     }
590   }
591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
592   for (i=0; i<totalslices; i++) { /* loop over full slices */
593     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
594     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595 
596     /* last slice may have padding rows. Don't use vectorization. */
597     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
598       for (r=0; r<(A->rmap->n & 0x07); ++r) {
599         row        = 8*i + r;
600         yval       = (MatScalar)0.0;
601         nnz_in_row = a->rlen[row];
602         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
603         z[row] = y[row] + yval;
604       }
605       break;
606     }
607 
608     vec_y  = _mm256_loadu_pd(y+8*i);
609     vec_y2 = _mm256_loadu_pd(y+8*i+4);
610 
611     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
612     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
613       vec_vals  = _mm256_loadu_pd(aval);
614       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
615       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
616       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
620       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
621       aval     += 4;
622 
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
625       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
631       aval     += 4;
632     }
633 
634     _mm256_storeu_pd(z+i*8,vec_y);
635     _mm256_storeu_pd(z+i*8+4,vec_y2);
636   }
637 #else
638   for (i=0; i<totalslices; i++) { /* loop over slices */
639     for (j=0; j<8; j++) sum[j] = 0.0;
640     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
641       sum[0] += aval[j] * x[acolidx[j]];
642       sum[1] += aval[j+1] * x[acolidx[j+1]];
643       sum[2] += aval[j+2] * x[acolidx[j+2]];
644       sum[3] += aval[j+3] * x[acolidx[j+3]];
645       sum[4] += aval[j+4] * x[acolidx[j+4]];
646       sum[5] += aval[j+5] * x[acolidx[j+5]];
647       sum[6] += aval[j+6] * x[acolidx[j+6]];
648       sum[7] += aval[j+7] * x[acolidx[j+7]];
649     }
650     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
651       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
652     } else {
653       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
654     }
655   }
656 #endif
657 
658   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
665 {
666   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
667   PetscScalar       *y;
668   const PetscScalar *x;
669   const MatScalar   *aval=a->val;
670   const PetscInt    *acolidx=a->colidx;
671   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
672   PetscErrorCode    ierr;
673 
674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675 #pragma disjoint(*x,*y,*aval)
676 #endif
677 
678   PetscFunctionBegin;
679   if (A->symmetric) {
680     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
681     PetscFunctionReturn(0);
682   }
683   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
686   for (i=0; i<a->totalslices; i++) { /* loop over slices */
687     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
688       for (r=0; r<(A->rmap->n & 0x07); ++r) {
689         row        = 8*i + r;
690         nnz_in_row = a->rlen[row];
691         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
692       }
693       break;
694     }
695     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
696       y[acolidx[j]]   += aval[j] * x[8*i];
697       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
698       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
699       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
700       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
701       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
702       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
703       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
704     }
705   }
706   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   if (A->symmetric) {
718     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
719   } else {
720     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
721     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
722   }
723   PetscFunctionReturn(0);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
730 {
731   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
732   PetscInt       *diag,i;
733   PetscErrorCode ierr;
734 
735   PetscFunctionBegin;
736   *missing = PETSC_FALSE;
737   if (A->rmap->n > 0 && !(a->colidx)) {
738     *missing = PETSC_TRUE;
739     if (d) *d = 0;
740     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
741   } else {
742     diag = a->diag;
743     for (i=0; i<A->rmap->n; i++) {
744       if (diag[i] == -1) {
745         *missing = PETSC_TRUE;
746         if (d) *d = i;
747         ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr);
748         break;
749       }
750     }
751   }
752   PetscFunctionReturn(0);
753 }
754 
755 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
756 {
757   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
758   PetscInt       i,j,m=A->rmap->n,shift;
759   PetscErrorCode ierr;
760 
761   PetscFunctionBegin;
762   if (!a->diag) {
763     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
764     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
765     a->free_diag = PETSC_TRUE;
766   }
767   for (i=0; i<m; i++) { /* loop over rows */
768     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
769     a->diag[i] = -1;
770     for (j=0; j<a->rlen[i]; j++) {
771       if (a->colidx[shift+j*8] == i) {
772         a->diag[i] = shift+j*8;
773         break;
774       }
775     }
776   }
777   PetscFunctionReturn(0);
778 }
779 
780 /*
781   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
782 */
783 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
784 {
785   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
786   PetscInt       i,*diag,m = A->rmap->n;
787   MatScalar      *val = a->val;
788   PetscScalar    *idiag,*mdiag;
789   PetscErrorCode ierr;
790 
791   PetscFunctionBegin;
792   if (a->idiagvalid) PetscFunctionReturn(0);
793   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
794   diag = a->diag;
795   if (!a->idiag) {
796     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
797     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
798     val  = a->val;
799   }
800   mdiag = a->mdiag;
801   idiag = a->idiag;
802 
803   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
804     for (i=0; i<m; i++) {
805       mdiag[i] = val[diag[i]];
806       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
807         if (PetscRealPart(fshift)) {
808           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
809           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
810           A->factorerror_zeropivot_value = 0.0;
811           A->factorerror_zeropivot_row   = i;
812         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
813       }
814       idiag[i] = 1.0/val[diag[i]];
815     }
816     ierr = PetscLogFlops(m);CHKERRQ(ierr);
817   } else {
818     for (i=0; i<m; i++) {
819       mdiag[i] = val[diag[i]];
820       idiag[i] = omega/(fshift + val[diag[i]]);
821     }
822     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
823   }
824   a->idiagvalid = PETSC_TRUE;
825   PetscFunctionReturn(0);
826 }
827 
828 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
829 {
830   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscArrayzero(a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
835   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode MatDestroy_SeqSELL(Mat A)
840 {
841   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
842   PetscErrorCode ierr;
843 
844   PetscFunctionBegin;
845 #if defined(PETSC_USE_LOG)
846   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
847 #endif
848   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
849   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
850   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
851   ierr = PetscFree(a->diag);CHKERRQ(ierr);
852   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
853   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
854   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
855   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
856   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
857   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
858   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
859 
860   ierr = PetscFree(A->data);CHKERRQ(ierr);
861 
862   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
863   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
864   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
865 #if defined(PETSC_HAVE_ELEMENTAL)
866 #endif
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
872 {
873   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
874   PetscErrorCode ierr;
875 
876   PetscFunctionBegin;
877   switch (op) {
878   case MAT_ROW_ORIENTED:
879     a->roworiented = flg;
880     break;
881   case MAT_KEEP_NONZERO_PATTERN:
882     a->keepnonzeropattern = flg;
883     break;
884   case MAT_NEW_NONZERO_LOCATIONS:
885     a->nonew = (flg ? 0 : 1);
886     break;
887   case MAT_NEW_NONZERO_LOCATION_ERR:
888     a->nonew = (flg ? -1 : 0);
889     break;
890   case MAT_NEW_NONZERO_ALLOCATION_ERR:
891     a->nonew = (flg ? -2 : 0);
892     break;
893   case MAT_UNUSED_NONZERO_LOCATION_ERR:
894     a->nounused = (flg ? -1 : 0);
895     break;
896   case MAT_NEW_DIAGONALS:
897   case MAT_IGNORE_OFF_PROC_ENTRIES:
898   case MAT_USE_HASH_TABLE:
899   case MAT_SORTED_FULL:
900     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
901     break;
902   case MAT_SPD:
903   case MAT_SYMMETRIC:
904   case MAT_STRUCTURALLY_SYMMETRIC:
905   case MAT_HERMITIAN:
906   case MAT_SYMMETRY_ETERNAL:
907     /* These options are handled directly by MatSetOption() */
908     break;
909   default:
910     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
911   }
912   PetscFunctionReturn(0);
913 }
914 
915 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
916 {
917   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
918   PetscInt       i,j,n,shift;
919   PetscScalar    *x,zero=0.0;
920   PetscErrorCode ierr;
921 
922   PetscFunctionBegin;
923   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
924   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
925 
926   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
927     PetscInt *diag=a->diag;
928     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
929     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
930     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
931     PetscFunctionReturn(0);
932   }
933 
934   ierr = VecSet(v,zero);CHKERRQ(ierr);
935   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
936   for (i=0; i<n; i++) { /* loop over rows */
937     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
938     x[i] = 0;
939     for (j=0; j<a->rlen[i]; j++) {
940       if (a->colidx[shift+j*8] == i) {
941         x[i] = a->val[shift+j*8];
942         break;
943       }
944     }
945   }
946   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
947   PetscFunctionReturn(0);
948 }
949 
950 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
951 {
952   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
953   const PetscScalar *l,*r;
954   PetscInt          i,j,m,n,row;
955   PetscErrorCode    ierr;
956 
957   PetscFunctionBegin;
958   if (ll) {
959     /* The local size is used so that VecMPI can be passed to this routine
960        by MatDiagonalScale_MPISELL */
961     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
962     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
963     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
964     for (i=0; i<a->totalslices; i++) { /* loop over slices */
965       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
966         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
968         }
969       } else {
970         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
971           a->val[j] *= l[8*i+row];
972         }
973       }
974     }
975     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
976     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
977   }
978   if (rr) {
979     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
980     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
981     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
982     for (i=0; i<a->totalslices; i++) { /* loop over slices */
983       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
984         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
985           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
986         }
987       } else {
988         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
989           a->val[j] *= r[a->colidx[j]];
990         }
991       }
992     }
993     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
994     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
995   }
996   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
997   PetscFunctionReturn(0);
998 }
999 
1000 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
1001 
1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1003 {
1004   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1005   PetscInt    *cp,i,k,low,high,t,row,col,l;
1006   PetscInt    shift;
1007   MatScalar   *vp;
1008 
1009   PetscFunctionBegin;
1010   for (k=0; k<m; k++) { /* loop over requested rows */
1011     row = im[k];
1012     if (row<0) continue;
1013 #if defined(PETSC_USE_DEBUG)
1014     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1015 #endif
1016     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1017     cp = a->colidx+shift; /* pointer to the row */
1018     vp = a->val+shift; /* pointer to the row */
1019     for (l=0; l<n; l++) { /* loop over requested columns */
1020       col = in[l];
1021       if (col<0) continue;
1022 #if defined(PETSC_USE_DEBUG)
1023       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1024 #endif
1025       high = a->rlen[row]; low = 0; /* assume unsorted */
1026       while (high-low > 5) {
1027         t = (low+high)/2;
1028         if (*(cp+t*8) > col) high = t;
1029         else low = t;
1030       }
1031       for (i=low; i<high; i++) {
1032         if (*(cp+8*i) > col) break;
1033         if (*(cp+8*i) == col) {
1034           *v++ = *(vp+8*i);
1035           goto finished;
1036         }
1037       }
1038       *v++ = 0.0;
1039     finished:;
1040     }
1041   }
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1046 {
1047   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1048   PetscInt          i,j,m=A->rmap->n,shift;
1049   const char        *name;
1050   PetscViewerFormat format;
1051   PetscErrorCode    ierr;
1052 
1053   PetscFunctionBegin;
1054   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1055   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1056     PetscInt nofinalvalue = 0;
1057     /*
1058     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1059       nofinalvalue = 1;
1060     }
1061     */
1062     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1063     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1064     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1065 #if defined(PETSC_USE_COMPLEX)
1066     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1067 #else
1068     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1069 #endif
1070     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1071 
1072     for (i=0; i<m; i++) {
1073       shift = a->sliidx[i>>3]+(i&0x07);
1074       for (j=0; j<a->rlen[i]; j++) {
1075 #if defined(PETSC_USE_COMPLEX)
1076         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1077 #else
1078         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1079 #endif
1080       }
1081     }
1082     /*
1083     if (nofinalvalue) {
1084 #if defined(PETSC_USE_COMPLEX)
1085       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1086 #else
1087       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1088 #endif
1089     }
1090     */
1091     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1092     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1093     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1094   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1095     PetscFunctionReturn(0);
1096   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1097     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1098     for (i=0; i<m; i++) {
1099       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1100       shift = a->sliidx[i>>3]+(i&0x07);
1101       for (j=0; j<a->rlen[i]; j++) {
1102 #if defined(PETSC_USE_COMPLEX)
1103         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1104           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1105         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1106           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1107         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1108           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1109         }
1110 #else
1111         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1112 #endif
1113       }
1114       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1115     }
1116     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1117   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1118     PetscInt    cnt=0,jcnt;
1119     PetscScalar value;
1120 #if defined(PETSC_USE_COMPLEX)
1121     PetscBool   realonly=PETSC_TRUE;
1122     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1123       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1124         realonly = PETSC_FALSE;
1125         break;
1126       }
1127     }
1128 #endif
1129 
1130     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1131     for (i=0; i<m; i++) {
1132       jcnt = 0;
1133       shift = a->sliidx[i>>3]+(i&0x07);
1134       for (j=0; j<A->cmap->n; j++) {
1135         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1136           value = a->val[cnt++];
1137           jcnt++;
1138         } else {
1139           value = 0.0;
1140         }
1141 #if defined(PETSC_USE_COMPLEX)
1142         if (realonly) {
1143           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1144         } else {
1145           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1146         }
1147 #else
1148         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1149 #endif
1150       }
1151       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1152     }
1153     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1154   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1155     PetscInt fshift=1;
1156     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1157 #if defined(PETSC_USE_COMPLEX)
1158     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1159 #else
1160     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1161 #endif
1162     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1163     for (i=0; i<m; i++) {
1164       shift = a->sliidx[i>>3]+(i&0x07);
1165       for (j=0; j<a->rlen[i]; j++) {
1166 #if defined(PETSC_USE_COMPLEX)
1167         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1168 #else
1169         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1170 #endif
1171       }
1172     }
1173     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1174   } else if (format == PETSC_VIEWER_NATIVE) {
1175     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1176       PetscInt row;
1177       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1178       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1179 #if defined(PETSC_USE_COMPLEX)
1180         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1181           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1182         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1183           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1184         } else {
1185           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1186         }
1187 #else
1188         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1189 #endif
1190       }
1191     }
1192   } else {
1193     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1194     if (A->factortype) {
1195       for (i=0; i<m; i++) {
1196         shift = a->sliidx[i>>3]+(i&0x07);
1197         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1198         /* L part */
1199         for (j=shift; j<a->diag[i]; j+=8) {
1200 #if defined(PETSC_USE_COMPLEX)
1201           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1202             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1203           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1204             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1205           } else {
1206             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1207           }
1208 #else
1209           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1210 #endif
1211         }
1212         /* diagonal */
1213         j = a->diag[i];
1214 #if defined(PETSC_USE_COMPLEX)
1215         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1217         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1219         } else {
1220           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1221         }
1222 #else
1223         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1224 #endif
1225 
1226         /* U part */
1227         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1228 #if defined(PETSC_USE_COMPLEX)
1229           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1231           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1233           } else {
1234             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1235           }
1236 #else
1237           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1238 #endif
1239         }
1240         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1241       }
1242     } else {
1243       for (i=0; i<m; i++) {
1244         shift = a->sliidx[i>>3]+(i&0x07);
1245         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1246         for (j=0; j<a->rlen[i]; j++) {
1247 #if defined(PETSC_USE_COMPLEX)
1248           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1249             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1250           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1251             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1252           } else {
1253             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1254           }
1255 #else
1256           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1257 #endif
1258         }
1259         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1260       }
1261     }
1262     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1263   }
1264   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #include <petscdraw.h>
1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1270 {
1271   Mat               A=(Mat)Aa;
1272   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1273   PetscInt          i,j,m=A->rmap->n,shift;
1274   int               color;
1275   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1276   PetscViewer       viewer;
1277   PetscViewerFormat format;
1278   PetscErrorCode    ierr;
1279 
1280   PetscFunctionBegin;
1281   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1282   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1283   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1284 
1285   /* loop over matrix elements drawing boxes */
1286 
1287   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1288     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1289     /* Blue for negative, Cyan for zero and  Red for positive */
1290     color = PETSC_DRAW_BLUE;
1291     for (i=0; i<m; i++) {
1292       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1293       y_l = m - i - 1.0; y_r = y_l + 1.0;
1294       for (j=0; j<a->rlen[i]; j++) {
1295         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1296         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1297         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1298       }
1299     }
1300     color = PETSC_DRAW_CYAN;
1301     for (i=0; i<m; i++) {
1302       shift = a->sliidx[i>>3]+(i&0x07);
1303       y_l = m - i - 1.0; y_r = y_l + 1.0;
1304       for (j=0; j<a->rlen[i]; j++) {
1305         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1306         if (a->val[shift+8*j] !=  0.) continue;
1307         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1308       }
1309     }
1310     color = PETSC_DRAW_RED;
1311     for (i=0; i<m; i++) {
1312       shift = a->sliidx[i>>3]+(i&0x07);
1313       y_l = m - i - 1.0; y_r = y_l + 1.0;
1314       for (j=0; j<a->rlen[i]; j++) {
1315         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1316         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1317         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1318       }
1319     }
1320     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1321   } else {
1322     /* use contour shading to indicate magnitude of values */
1323     /* first determine max of all nonzero values */
1324     PetscReal minv=0.0,maxv=0.0;
1325     PetscInt  count=0;
1326     PetscDraw popup;
1327     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1328       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1329     }
1330     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1331     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1332     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1333 
1334     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1335     for (i=0; i<m; i++) {
1336       shift = a->sliidx[i>>3]+(i&0x07);
1337       y_l = m - i - 1.0;
1338       y_r = y_l + 1.0;
1339       for (j=0; j<a->rlen[i]; j++) {
1340         x_l = a->colidx[shift+j*8];
1341         x_r = x_l + 1.0;
1342         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1343         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1344         count++;
1345       }
1346     }
1347     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1348   }
1349   PetscFunctionReturn(0);
1350 }
1351 
1352 #include <petscdraw.h>
1353 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1354 {
1355   PetscDraw      draw;
1356   PetscReal      xr,yr,xl,yl,h,w;
1357   PetscBool      isnull;
1358   PetscErrorCode ierr;
1359 
1360   PetscFunctionBegin;
1361   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1362   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1363   if (isnull) PetscFunctionReturn(0);
1364 
1365   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1366   xr  += w;          yr += h;         xl = -w;     yl = -h;
1367   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1368   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1369   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1370   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1371   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1376 {
1377   PetscBool      iascii,isbinary,isdraw;
1378   PetscErrorCode ierr;
1379 
1380   PetscFunctionBegin;
1381   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1382   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1383   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1384   if (iascii) {
1385     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1386   } else if (isbinary) {
1387     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1388   } else if (isdraw) {
1389     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1390   }
1391   PetscFunctionReturn(0);
1392 }
1393 
1394 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1395 {
1396   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1397   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1398   MatScalar      *vp;
1399   PetscErrorCode ierr;
1400 
1401   PetscFunctionBegin;
1402   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1403   /* To do: compress out the unused elements */
1404   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1405   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1406   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1407   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1408   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1409   for (i=0; i<a->totalslices; ++i) {
1410     shift = a->sliidx[i];    /* starting index of the slice */
1411     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1412     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1413     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1414       row  = 8*i + row_in_slice;
1415       nrow = a->rlen[row]; /* number of nonzeros in row */
1416       /*
1417         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1418         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1419       */
1420       lastcol = 0;
1421       if (nrow>0) { /* nonempty row */
1422         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1423       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1424         for (j=1;j<8;j++) {
1425           if (a->rlen[8*i+j]) {
1426             lastcol = cp[j];
1427             break;
1428           }
1429         }
1430       } else {
1431         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1432       }
1433 
1434       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1435         cp[8*k+row_in_slice] = lastcol;
1436         vp[8*k+row_in_slice] = (MatScalar)0;
1437       }
1438     }
1439   }
1440 
1441   A->info.mallocs += a->reallocs;
1442   a->reallocs      = 0;
1443 
1444   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1449 {
1450   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1451 
1452   PetscFunctionBegin;
1453   info->block_size   = 1.0;
1454   info->nz_allocated = a->maxallocmat;
1455   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1456   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1457   info->assemblies   = A->num_ass;
1458   info->mallocs      = A->info.mallocs;
1459   info->memory       = ((PetscObject)A)->mem;
1460   if (A->factortype) {
1461     info->fill_ratio_given  = A->info.fill_ratio_given;
1462     info->fill_ratio_needed = A->info.fill_ratio_needed;
1463     info->factor_mallocs    = A->info.factor_mallocs;
1464   } else {
1465     info->fill_ratio_given  = 0;
1466     info->fill_ratio_needed = 0;
1467     info->factor_mallocs    = 0;
1468   }
1469   PetscFunctionReturn(0);
1470 }
1471 
1472 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1473 {
1474   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1475   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1476   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1477   MatScalar      *vp,value;
1478   PetscErrorCode ierr;
1479 
1480   PetscFunctionBegin;
1481   for (k=0; k<m; k++) { /* loop over added rows */
1482     row = im[k];
1483     if (row < 0) continue;
1484 #if defined(PETSC_USE_DEBUG)
1485     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1486 #endif
1487     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1488     cp    = a->colidx+shift; /* pointer to the row */
1489     vp    = a->val+shift; /* pointer to the row */
1490     nrow  = a->rlen[row];
1491     low   = 0;
1492     high  = nrow;
1493 
1494     for (l=0; l<n; l++) { /* loop over added columns */
1495       col = in[l];
1496       if (col<0) continue;
1497 #if defined(PETSC_USE_DEBUG)
1498       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1499 #endif
1500       if (a->roworiented) {
1501         value = v[l+k*n];
1502       } else {
1503         value = v[k+l*m];
1504       }
1505       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1506 
1507       /* search in this row for the specified colmun, i indicates the column to be set */
1508       if (col <= lastcol) low = 0;
1509       else high = nrow;
1510       lastcol = col;
1511       while (high-low > 5) {
1512         t = (low+high)/2;
1513         if (*(cp+t*8) > col) high = t;
1514         else low = t;
1515       }
1516       for (i=low; i<high; i++) {
1517         if (*(cp+i*8) > col) break;
1518         if (*(cp+i*8) == col) {
1519           if (is == ADD_VALUES) *(vp+i*8) += value;
1520           else *(vp+i*8) = value;
1521           low = i + 1;
1522           goto noinsert;
1523         }
1524       }
1525       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1526       if (nonew == 1) goto noinsert;
1527       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1528       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1529       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1530       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1531       for (ii=nrow-1; ii>=i; ii--) {
1532         *(cp+(ii+1)*8) = *(cp+ii*8);
1533         *(vp+(ii+1)*8) = *(vp+ii*8);
1534       }
1535       a->rlen[row]++;
1536       *(cp+i*8) = col;
1537       *(vp+i*8) = value;
1538       a->nz++;
1539       A->nonzerostate++;
1540       low = i+1; high++; nrow++;
1541 noinsert:;
1542     }
1543     a->rlen[row] = nrow;
1544   }
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1549 {
1550   PetscErrorCode ierr;
1551 
1552   PetscFunctionBegin;
1553   /* If the two matrices have the same copy implementation, use fast copy. */
1554   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1555     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1556     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1557 
1558     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1559     ierr = PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1560   } else {
1561     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1562   }
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1567 {
1568   PetscErrorCode ierr;
1569 
1570   PetscFunctionBegin;
1571   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1576 {
1577   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1578 
1579   PetscFunctionBegin;
1580   *array = a->val;
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1585 {
1586   PetscFunctionBegin;
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1591 {
1592   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1593   PetscInt    i;
1594   MatScalar   *aval=a->val;
1595 
1596   PetscFunctionBegin;
1597   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1602 {
1603   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1604   PetscInt       i;
1605   MatScalar      *aval=a->val;
1606   PetscErrorCode ierr;
1607 
1608   PetscFunctionBegin;
1609   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1610   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1611   PetscFunctionReturn(0);
1612 }
1613 
1614 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1615 {
1616   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1617   MatScalar      *aval=a->val;
1618   PetscScalar    oalpha=alpha;
1619   PetscBLASInt   one=1,size;
1620   PetscErrorCode ierr;
1621 
1622   PetscFunctionBegin;
1623   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1624   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1625   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1626   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1631 {
1632   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1633   PetscErrorCode ierr;
1634 
1635   PetscFunctionBegin;
1636   if (!Y->preallocated || !y->nz) {
1637     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1638   }
1639   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1640   PetscFunctionReturn(0);
1641 }
1642 
1643 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1644 {
1645   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1646   PetscScalar       *x,sum,*t;
1647   const MatScalar   *idiag=0,*mdiag;
1648   const PetscScalar *b,*xb;
1649   PetscInt          n,m=A->rmap->n,i,j,shift;
1650   const PetscInt    *diag;
1651   PetscErrorCode    ierr;
1652 
1653   PetscFunctionBegin;
1654   its = its*lits;
1655 
1656   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1657   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1658   a->fshift = fshift;
1659   a->omega  = omega;
1660 
1661   diag  = a->diag;
1662   t     = a->ssor_work;
1663   idiag = a->idiag;
1664   mdiag = a->mdiag;
1665 
1666   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1667   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1668   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1669   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1670   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1671   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1672 
1673   if (flag & SOR_ZERO_INITIAL_GUESS) {
1674     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1675       for (i=0; i<m; i++) {
1676         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1677         sum   = b[i];
1678         n     = (diag[i]-shift)/8;
1679         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1680         t[i]  = sum;
1681         x[i]  = sum*idiag[i];
1682       }
1683       xb   = t;
1684       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1685     } else xb = b;
1686     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1687       for (i=m-1; i>=0; i--) {
1688         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1689         sum   = xb[i];
1690         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1691         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1692         if (xb == b) {
1693           x[i] = sum*idiag[i];
1694         } else {
1695           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1696         }
1697       }
1698       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1699     }
1700     its--;
1701   }
1702   while (its--) {
1703     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1704       for (i=0; i<m; i++) {
1705         /* lower */
1706         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1707         sum   = b[i];
1708         n     = (diag[i]-shift)/8;
1709         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1710         t[i]  = sum;             /* save application of the lower-triangular part */
1711         /* upper */
1712         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1713         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1714         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1715       }
1716       xb   = t;
1717       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1718     } else xb = b;
1719     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1720       for (i=m-1; i>=0; i--) {
1721         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1722         sum = xb[i];
1723         if (xb == b) {
1724           /* whole matrix (no checkpointing available) */
1725           n     = a->rlen[i];
1726           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1727           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1728         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1729           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1730           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1731           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1732         }
1733       }
1734       if (xb == b) {
1735         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1736       } else {
1737         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1738       }
1739     }
1740   }
1741   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1743   PetscFunctionReturn(0);
1744 }
1745 
1746 /* -------------------------------------------------------------------*/
1747 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1748                                        MatGetRow_SeqSELL,
1749                                        MatRestoreRow_SeqSELL,
1750                                        MatMult_SeqSELL,
1751                                /* 4*/  MatMultAdd_SeqSELL,
1752                                        MatMultTranspose_SeqSELL,
1753                                        MatMultTransposeAdd_SeqSELL,
1754                                        0,
1755                                        0,
1756                                        0,
1757                                /* 10*/ 0,
1758                                        0,
1759                                        0,
1760                                        MatSOR_SeqSELL,
1761                                        0,
1762                                /* 15*/ MatGetInfo_SeqSELL,
1763                                        MatEqual_SeqSELL,
1764                                        MatGetDiagonal_SeqSELL,
1765                                        MatDiagonalScale_SeqSELL,
1766                                        0,
1767                                /* 20*/ 0,
1768                                        MatAssemblyEnd_SeqSELL,
1769                                        MatSetOption_SeqSELL,
1770                                        MatZeroEntries_SeqSELL,
1771                                /* 24*/ 0,
1772                                        0,
1773                                        0,
1774                                        0,
1775                                        0,
1776                                /* 29*/ MatSetUp_SeqSELL,
1777                                        0,
1778                                        0,
1779                                        0,
1780                                        0,
1781                                /* 34*/ MatDuplicate_SeqSELL,
1782                                        0,
1783                                        0,
1784                                        0,
1785                                        0,
1786                                /* 39*/ 0,
1787                                        0,
1788                                        0,
1789                                        MatGetValues_SeqSELL,
1790                                        MatCopy_SeqSELL,
1791                                /* 44*/ 0,
1792                                        MatScale_SeqSELL,
1793                                        MatShift_SeqSELL,
1794                                        0,
1795                                        0,
1796                                /* 49*/ 0,
1797                                        0,
1798                                        0,
1799                                        0,
1800                                        0,
1801                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1802                                        0,
1803                                        0,
1804                                        0,
1805                                        0,
1806                                /* 59*/ 0,
1807                                        MatDestroy_SeqSELL,
1808                                        MatView_SeqSELL,
1809                                        0,
1810                                        0,
1811                                /* 64*/ 0,
1812                                        0,
1813                                        0,
1814                                        0,
1815                                        0,
1816                                /* 69*/ 0,
1817                                        0,
1818                                        0,
1819                                        0,
1820                                        0,
1821                                /* 74*/ 0,
1822                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1823                                        0,
1824                                        0,
1825                                        0,
1826                                /* 79*/ 0,
1827                                        0,
1828                                        0,
1829                                        0,
1830                                        0,
1831                                /* 84*/ 0,
1832                                        0,
1833                                        0,
1834                                        0,
1835                                        0,
1836                                /* 89*/ 0,
1837                                        0,
1838                                        0,
1839                                        0,
1840                                        0,
1841                                /* 94*/ 0,
1842                                        0,
1843                                        0,
1844                                        0,
1845                                        0,
1846                                /* 99*/ 0,
1847                                        0,
1848                                        0,
1849                                        MatConjugate_SeqSELL,
1850                                        0,
1851                                /*104*/ 0,
1852                                        0,
1853                                        0,
1854                                        0,
1855                                        0,
1856                                /*109*/ 0,
1857                                        0,
1858                                        0,
1859                                        0,
1860                                        MatMissingDiagonal_SeqSELL,
1861                                /*114*/ 0,
1862                                        0,
1863                                        0,
1864                                        0,
1865                                        0,
1866                                /*119*/ 0,
1867                                        0,
1868                                        0,
1869                                        0,
1870                                        0,
1871                                /*124*/ 0,
1872                                        0,
1873                                        0,
1874                                        0,
1875                                        0,
1876                                /*129*/ 0,
1877                                        0,
1878                                        0,
1879                                        0,
1880                                        0,
1881                                /*134*/ 0,
1882                                        0,
1883                                        0,
1884                                        0,
1885                                        0,
1886                                /*139*/ 0,
1887                                        0,
1888                                        0,
1889                                        MatFDColoringSetUp_SeqXAIJ,
1890                                        0,
1891                                 /*144*/0
1892 };
1893 
1894 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1895 {
1896   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1897   PetscErrorCode ierr;
1898 
1899   PetscFunctionBegin;
1900   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1901 
1902   /* allocate space for values if not already there */
1903   if (!a->saved_values) {
1904     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1905     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1906   }
1907 
1908   /* copy values over */
1909   ierr = PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1910   PetscFunctionReturn(0);
1911 }
1912 
1913 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1914 {
1915   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1916   PetscErrorCode ierr;
1917 
1918   PetscFunctionBegin;
1919   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1920   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1921   ierr = PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]);CHKERRQ(ierr);
1922   PetscFunctionReturn(0);
1923 }
1924 
1925 /*@C
1926  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1927 
1928  Not Collective
1929 
1930  Input Parameters:
1931  .  mat - a MATSEQSELL matrix
1932  .  array - pointer to the data
1933 
1934  Level: intermediate
1935 
1936  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1937  @*/
1938 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1939 {
1940   PetscErrorCode ierr;
1941 
1942   PetscFunctionBegin;
1943   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1944   PetscFunctionReturn(0);
1945 }
1946 
1947 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1948 {
1949   Mat_SeqSELL    *b;
1950   PetscMPIInt    size;
1951   PetscErrorCode ierr;
1952 
1953   PetscFunctionBegin;
1954   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1955   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1956 
1957   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1958 
1959   B->data = (void*)b;
1960 
1961   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1962 
1963   b->row                = 0;
1964   b->col                = 0;
1965   b->icol               = 0;
1966   b->reallocs           = 0;
1967   b->ignorezeroentries  = PETSC_FALSE;
1968   b->roworiented        = PETSC_TRUE;
1969   b->nonew              = 0;
1970   b->diag               = 0;
1971   b->solve_work         = 0;
1972   B->spptr              = 0;
1973   b->saved_values       = 0;
1974   b->idiag              = 0;
1975   b->mdiag              = 0;
1976   b->ssor_work          = 0;
1977   b->omega              = 1.0;
1978   b->fshift             = 0.0;
1979   b->idiagvalid         = PETSC_FALSE;
1980   b->keepnonzeropattern = PETSC_FALSE;
1981 
1982   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1983   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1984   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1985   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1986   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1987   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1988   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1989   PetscFunctionReturn(0);
1990 }
1991 
1992 /*
1993  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1994  */
1995 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1996 {
1997   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1998   PetscInt       i,m=A->rmap->n;
1999   PetscInt       totalslices=a->totalslices;
2000   PetscErrorCode ierr;
2001 
2002   PetscFunctionBegin;
2003   c = (Mat_SeqSELL*)C->data;
2004 
2005   C->factortype = A->factortype;
2006   c->row        = 0;
2007   c->col        = 0;
2008   c->icol       = 0;
2009   c->reallocs   = 0;
2010 
2011   C->assembled = PETSC_TRUE;
2012 
2013   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2014   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2015 
2016   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2017   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2018   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2019   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2020 
2021   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2022   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2023 
2024   /* allocate the matrix space */
2025   if (mallocmatspace) {
2026     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2027     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2028 
2029     c->singlemalloc = PETSC_TRUE;
2030 
2031     if (m > 0) {
2032       ierr = PetscArraycpy(c->colidx,a->colidx,a->maxallocmat);CHKERRQ(ierr);
2033       if (cpvalues == MAT_COPY_VALUES) {
2034         ierr = PetscArraycpy(c->val,a->val,a->maxallocmat);CHKERRQ(ierr);
2035       } else {
2036         ierr = PetscArrayzero(c->val,a->maxallocmat);CHKERRQ(ierr);
2037       }
2038     }
2039   }
2040 
2041   c->ignorezeroentries = a->ignorezeroentries;
2042   c->roworiented       = a->roworiented;
2043   c->nonew             = a->nonew;
2044   if (a->diag) {
2045     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2046     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2047     for (i=0; i<m; i++) {
2048       c->diag[i] = a->diag[i];
2049     }
2050   } else c->diag = 0;
2051 
2052   c->solve_work         = 0;
2053   c->saved_values       = 0;
2054   c->idiag              = 0;
2055   c->ssor_work          = 0;
2056   c->keepnonzeropattern = a->keepnonzeropattern;
2057   c->free_val           = PETSC_TRUE;
2058   c->free_colidx        = PETSC_TRUE;
2059 
2060   c->maxallocmat  = a->maxallocmat;
2061   c->maxallocrow  = a->maxallocrow;
2062   c->rlenmax      = a->rlenmax;
2063   c->nz           = a->nz;
2064   C->preallocated = PETSC_TRUE;
2065 
2066   c->nonzerorowcnt = a->nonzerorowcnt;
2067   C->nonzerostate  = A->nonzerostate;
2068 
2069   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2070   PetscFunctionReturn(0);
2071 }
2072 
2073 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2074 {
2075   PetscErrorCode ierr;
2076 
2077   PetscFunctionBegin;
2078   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2079   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2080   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2081     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2082   }
2083   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2084   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2085   PetscFunctionReturn(0);
2086 }
2087 
2088 /*@C
2089  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2090 
2091  Collective
2092 
2093  Input Parameters:
2094  +  comm - MPI communicator, set to PETSC_COMM_SELF
2095  .  m - number of rows
2096  .  n - number of columns
2097  .  rlenmax - maximum number of nonzeros in a row
2098  -  rlen - array containing the number of nonzeros in the various rows
2099  (possibly different for each row) or NULL
2100 
2101  Output Parameter:
2102  .  A - the matrix
2103 
2104  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2105  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2106  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2107 
2108  Notes:
2109  If nnz is given then nz is ignored
2110 
2111  Specify the preallocated storage with either rlenmax or rlen (not both).
2112  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2113  allocation.  For large problems you MUST preallocate memory or you
2114  will get TERRIBLE performance, see the users' manual chapter on matrices.
2115 
2116  Level: intermediate
2117 
2118  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2119 
2120  @*/
2121 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2122 {
2123   PetscErrorCode ierr;
2124 
2125   PetscFunctionBegin;
2126   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2127   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2128   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2129   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2130   PetscFunctionReturn(0);
2131 }
2132 
2133 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2134 {
2135   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2136   PetscInt       totalslices=a->totalslices;
2137   PetscErrorCode ierr;
2138 
2139   PetscFunctionBegin;
2140   /* If the  matrix dimensions are not equal,or no of nonzeros */
2141   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2142     *flg = PETSC_FALSE;
2143     PetscFunctionReturn(0);
2144   }
2145   /* if the a->colidx are the same */
2146   ierr = PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2147   if (!*flg) PetscFunctionReturn(0);
2148   /* if a->val are the same */
2149   ierr = PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg);CHKERRQ(ierr);
2150   PetscFunctionReturn(0);
2151 }
2152 
2153 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2154 {
2155   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2156 
2157   PetscFunctionBegin;
2158   a->idiagvalid  = PETSC_FALSE;
2159   PetscFunctionReturn(0);
2160 }
2161 
2162 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2163 {
2164 #if defined(PETSC_USE_COMPLEX)
2165   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2166   PetscInt    i;
2167   PetscScalar *val = a->val;
2168 
2169   PetscFunctionBegin;
2170   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2171     val[i] = PetscConj(val[i]);
2172   }
2173 #else
2174   PetscFunctionBegin;
2175 #endif
2176   PetscFunctionReturn(0);
2177 }
2178