xref: /petsc/src/mat/impls/sell/seq/sell.c (revision fa2f3a98544cffa8147e57031dd6780dd55215d2)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective on MPI_Comm
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accuate as possible.
67  If it is underesitmated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz,*cols;
230     PetscScalar *vals;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261   if (A->rmap->bs > 1) {
262     ierr = MatConvert_Basic(A,newtype,reuse,newmat);CHKERRQ(ierr);
263     PetscFunctionReturn(0);
264   }
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     /* Can we just use ilen? */
270     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
271     for (i=0; i<m; i++) {
272       rowlengths[i] = ai[i+1] - ai[i];
273     }
274 
275     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
276     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
277     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
278     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
279     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
280   }
281 
282   for (row=0; row<m; row++) {
283     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
284     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
285     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
286   }
287   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
289   B->rmap->bs = A->rmap->bs;
290 
291   if (reuse == MAT_INPLACE_MATRIX) {
292     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
293   } else {
294     *newmat = B;
295   }
296   PetscFunctionReturn(0);
297 }
298 
299 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
300 {
301   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
302   PetscScalar       *y;
303   const PetscScalar *x;
304   const MatScalar   *aval=a->val;
305   PetscInt          totalslices=a->totalslices;
306   const PetscInt    *acolidx=a->colidx;
307   PetscInt          i,j;
308   PetscErrorCode    ierr;
309 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
310   __m512d           vec_x,vec_y,vec_vals;
311   __m256i           vec_idx;
312   __mmask8          mask;
313   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
314   __m256i           vec_idx2,vec_idx3,vec_idx4;
315 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m128i           vec_idx;
317   __m256d           vec_x,vec_y,vec_y2,vec_vals;
318   MatScalar         yval;
319   PetscInt          r,rows_left,row,nnz_in_row;
320 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
321   __m128d           vec_x_tmp;
322   __m256d           vec_x,vec_y,vec_y2,vec_vals;
323   MatScalar         yval;
324   PetscInt          r,rows_left,row,nnz_in_row;
325 #else
326   PetscScalar       sum[8];
327 #endif
328 
329 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330 #pragma disjoint(*x,*y,*aval)
331 #endif
332 
333   PetscFunctionBegin;
334   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
336 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
337   for (i=0; i<totalslices; i++) { /* loop over slices */
338     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
339     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
340 
341     vec_y  = _mm512_setzero_pd();
342     vec_y2 = _mm512_setzero_pd();
343     vec_y3 = _mm512_setzero_pd();
344     vec_y4 = _mm512_setzero_pd();
345 
346     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
347     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
348     case 3:
349       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
350       acolidx += 8; aval += 8;
351       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
352       acolidx += 8; aval += 8;
353       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
354       acolidx += 8; aval += 8;
355       j += 3;
356       break;
357     case 2:
358       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
359       acolidx += 8; aval += 8;
360       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
361       acolidx += 8; aval += 8;
362       j += 2;
363       break;
364     case 1:
365       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
366       acolidx += 8; aval += 8;
367       j += 1;
368       break;
369     }
370     #pragma novector
371     for (; j<(a->sliidx[i+1]>>3); j+=4) {
372       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
375       acolidx += 8; aval += 8;
376       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
377       acolidx += 8; aval += 8;
378       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
379       acolidx += 8; aval += 8;
380     }
381 
382     vec_y = _mm512_add_pd(vec_y,vec_y2);
383     vec_y = _mm512_add_pd(vec_y,vec_y3);
384     vec_y = _mm512_add_pd(vec_y,vec_y4);
385     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
386       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
387       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
388     } else {
389       _mm512_storeu_pd(&y[8*i],vec_y);
390     }
391   }
392 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
393   for (i=0; i<totalslices; i++) { /* loop over full slices */
394     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
395     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
396 
397     /* last slice may have padding rows. Don't use vectorization. */
398     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
399       rows_left = A->rmap->n - 8*i;
400       for (r=0; r<rows_left; ++r) {
401         yval = (MatScalar)0;
402         row = 8*i + r;
403         nnz_in_row = a->rlen[row];
404         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
405         y[row] = yval;
406       }
407       break;
408     }
409 
410     vec_y  = _mm256_setzero_pd();
411     vec_y2 = _mm256_setzero_pd();
412 
413     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
414     #pragma novector
415     #pragma unroll(2)
416     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
417       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
418       aval += 4; acolidx += 4;
419       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
420       aval += 4; acolidx += 4;
421     }
422 
423     _mm256_storeu_pd(y+i*8,vec_y);
424     _mm256_storeu_pd(y+i*8+4,vec_y2);
425   }
426 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427   for (i=0; i<totalslices; i++) { /* loop over full slices */
428     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
429     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
430 
431     vec_y  = _mm256_setzero_pd();
432     vec_y2 = _mm256_setzero_pd();
433 
434     /* last slice may have padding rows. Don't use vectorization. */
435     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
436       rows_left = A->rmap->n - 8*i;
437       for (r=0; r<rows_left; ++r) {
438         yval = (MatScalar)0;
439         row = 8*i + r;
440         nnz_in_row = a->rlen[row];
441         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
442         y[row] = yval;
443       }
444       break;
445     }
446 
447     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
448     #pragma novector
449     #pragma unroll(2)
450     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
451       vec_vals  = _mm256_loadu_pd(aval);
452       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
453       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
454       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
455       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
456       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
457       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
458       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
459       aval     += 4;
460 
461       vec_vals  = _mm256_loadu_pd(aval);
462       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
463       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
464       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
465       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
466       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
467       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
468       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
469       aval     += 4;
470     }
471 
472     _mm256_storeu_pd(y + i*8,     vec_y);
473     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
474   }
475 #else
476   for (i=0; i<totalslices; i++) { /* loop over slices */
477     for (j=0; j<8; j++) sum[j] = 0.0;
478     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
479       sum[0] += aval[j] * x[acolidx[j]];
480       sum[1] += aval[j+1] * x[acolidx[j+1]];
481       sum[2] += aval[j+2] * x[acolidx[j+2]];
482       sum[3] += aval[j+3] * x[acolidx[j+3]];
483       sum[4] += aval[j+4] * x[acolidx[j+4]];
484       sum[5] += aval[j+5] * x[acolidx[j+5]];
485       sum[6] += aval[j+6] * x[acolidx[j+6]];
486       sum[7] += aval[j+7] * x[acolidx[j+7]];
487     }
488     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
489       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
490     } else {
491       for(j=0; j<8; j++) y[8*i+j] = sum[j];
492     }
493   }
494 #endif
495 
496   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
497   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
499   PetscFunctionReturn(0);
500 }
501 
502 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
503 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
504 {
505   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
506   PetscScalar       *y,*z;
507   const PetscScalar *x;
508   const MatScalar   *aval=a->val;
509   PetscInt          totalslices=a->totalslices;
510   const PetscInt    *acolidx=a->colidx;
511   PetscInt          i,j;
512   PetscErrorCode    ierr;
513 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514   __m512d           vec_x,vec_y,vec_vals;
515   __m256i           vec_idx;
516   __mmask8          mask;
517   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
518   __m256i           vec_idx2,vec_idx3,vec_idx4;
519 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m128d           vec_x_tmp;
521   __m256d           vec_x,vec_y,vec_y2,vec_vals;
522   MatScalar         yval;
523   PetscInt          r,row,nnz_in_row;
524 #else
525   PetscScalar       sum[8];
526 #endif
527 
528 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
529 #pragma disjoint(*x,*y,*aval)
530 #endif
531 
532   PetscFunctionBegin;
533   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
534   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
535 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
536   for (i=0; i<totalslices; i++) { /* loop over slices */
537     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
538     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
539 
540     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
541       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
542       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
543     } else {
544       vec_y  = _mm512_loadu_pd(&y[8*i]);
545     }
546     vec_y2 = _mm512_setzero_pd();
547     vec_y3 = _mm512_setzero_pd();
548     vec_y4 = _mm512_setzero_pd();
549 
550     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
551     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
552     case 3:
553       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
554       acolidx += 8; aval += 8;
555       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
556       acolidx += 8; aval += 8;
557       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
558       acolidx += 8; aval += 8;
559       j += 3;
560       break;
561     case 2:
562       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
563       acolidx += 8; aval += 8;
564       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
565       acolidx += 8; aval += 8;
566       j += 2;
567       break;
568     case 1:
569       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
570       acolidx += 8; aval += 8;
571       j += 1;
572       break;
573     }
574     #pragma novector
575     for (; j<(a->sliidx[i+1]>>3); j+=4) {
576       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
579       acolidx += 8; aval += 8;
580       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
581       acolidx += 8; aval += 8;
582       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
583       acolidx += 8; aval += 8;
584     }
585 
586     vec_y = _mm512_add_pd(vec_y,vec_y2);
587     vec_y = _mm512_add_pd(vec_y,vec_y3);
588     vec_y = _mm512_add_pd(vec_y,vec_y4);
589     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
590       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
591     } else {
592       _mm512_storeu_pd(&z[8*i],vec_y);
593     }
594   }
595 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
596   for (i=0; i<totalslices; i++) { /* loop over full slices */
597     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
598     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
599 
600     /* last slice may have padding rows. Don't use vectorization. */
601     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
602       for (r=0; r<(A->rmap->n & 0x07); ++r) {
603         row        = 8*i + r;
604         yval       = (MatScalar)0.0;
605         nnz_in_row = a->rlen[row];
606         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
607         z[row] = y[row] + yval;
608       }
609       break;
610     }
611 
612     vec_y  = _mm256_loadu_pd(y+8*i);
613     vec_y2 = _mm256_loadu_pd(y+8*i+4);
614 
615     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
616     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
617       vec_vals  = _mm256_loadu_pd(aval);
618       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
619       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
620       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
621       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
622       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
623       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
624       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
625       aval     += 4;
626 
627       vec_vals  = _mm256_loadu_pd(aval);
628       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
631       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
632       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
633       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
634       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
635       aval     += 4;
636     }
637 
638     _mm256_storeu_pd(z+i*8,vec_y);
639     _mm256_storeu_pd(z+i*8+4,vec_y2);
640   }
641 #else
642   for (i=0; i<totalslices; i++) { /* loop over slices */
643     for (j=0; j<8; j++) sum[j] = 0.0;
644     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
645       sum[0] += aval[j] * x[acolidx[j]];
646       sum[1] += aval[j+1] * x[acolidx[j+1]];
647       sum[2] += aval[j+2] * x[acolidx[j+2]];
648       sum[3] += aval[j+3] * x[acolidx[j+3]];
649       sum[4] += aval[j+4] * x[acolidx[j+4]];
650       sum[5] += aval[j+5] * x[acolidx[j+5]];
651       sum[6] += aval[j+6] * x[acolidx[j+6]];
652       sum[7] += aval[j+7] * x[acolidx[j+7]];
653     }
654     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
655       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
656     } else {
657       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
658     }
659   }
660 #endif
661 
662   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
663   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
664   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
669 {
670   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
671   PetscScalar       *y;
672   const PetscScalar *x;
673   const MatScalar   *aval=a->val;
674   const PetscInt    *acolidx=a->colidx;
675   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
676   PetscErrorCode    ierr;
677 
678 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
679 #pragma disjoint(*x,*y,*aval)
680 #endif
681 
682   PetscFunctionBegin;
683   if (A->symmetric) {
684     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
685     PetscFunctionReturn(0);
686   }
687   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
688   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
689   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
690   for (i=0; i<a->totalslices; i++) { /* loop over slices */
691     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
692       for (r=0; r<(A->rmap->n & 0x07); ++r) {
693         row        = 8*i + r;
694         nnz_in_row = a->rlen[row];
695         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
696       }
697       break;
698     }
699     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
700       y[acolidx[j]]   += aval[j] * x[8*i];
701       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
702       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
703       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
704       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
705       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
706       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
707       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
708     }
709   }
710   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
711   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
712   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
713   PetscFunctionReturn(0);
714 }
715 
716 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
717 {
718   PetscErrorCode ierr;
719 
720   PetscFunctionBegin;
721   if (A->symmetric) {
722     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
723   } else {
724     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
725     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
726   }
727   PetscFunctionReturn(0);
728 }
729 
730 /*
731      Checks for missing diagonals
732 */
733 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
734 {
735   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
736   PetscInt    *diag,i;
737 
738   PetscFunctionBegin;
739   *missing = PETSC_FALSE;
740   if (A->rmap->n > 0 && !(a->colidx)) {
741     *missing = PETSC_TRUE;
742     if (d) *d = 0;
743     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
744   } else {
745     diag = a->diag;
746     for (i=0; i<A->rmap->n; i++) {
747       if (diag[i] == -1) {
748         *missing = PETSC_TRUE;
749         if (d) *d = i;
750         PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);
751         break;
752       }
753     }
754   }
755   PetscFunctionReturn(0);
756 }
757 
758 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
759 {
760   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
761   PetscInt       i,j,m=A->rmap->n,shift;
762   PetscErrorCode ierr;
763 
764   PetscFunctionBegin;
765   if (!a->diag) {
766     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
767     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
768     a->free_diag = PETSC_TRUE;
769   }
770   for (i=0; i<m; i++) { /* loop over rows */
771     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
772     a->diag[i] = -1;
773     for (j=0; j<a->rlen[i]; j++) {
774       if (a->colidx[shift+j*8] == i) {
775         a->diag[i] = shift+j*8;
776         break;
777       }
778     }
779   }
780   PetscFunctionReturn(0);
781 }
782 
783 /*
784   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
785 */
786 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
787 {
788   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
789   PetscInt       i,*diag,m = A->rmap->n;
790   MatScalar      *val = a->val;
791   PetscScalar    *idiag,*mdiag;
792   PetscErrorCode ierr;
793 
794   PetscFunctionBegin;
795   if (a->idiagvalid) PetscFunctionReturn(0);
796   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
797   diag = a->diag;
798   if (!a->idiag) {
799     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
800     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
801     val  = a->val;
802   }
803   mdiag = a->mdiag;
804   idiag = a->idiag;
805 
806   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
807     for (i=0; i<m; i++) {
808       mdiag[i] = val[diag[i]];
809       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
810         if (PetscRealPart(fshift)) {
811           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
812           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
813           A->factorerror_zeropivot_value = 0.0;
814           A->factorerror_zeropivot_row   = i;
815         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
816       }
817       idiag[i] = 1.0/val[diag[i]];
818     }
819     ierr = PetscLogFlops(m);CHKERRQ(ierr);
820   } else {
821     for (i=0; i<m; i++) {
822       mdiag[i] = val[diag[i]];
823       idiag[i] = omega/(fshift + val[diag[i]]);
824     }
825     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
826   }
827   a->idiagvalid = PETSC_TRUE;
828   PetscFunctionReturn(0);
829 }
830 
831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
832 {
833   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
834   PetscErrorCode ierr;
835 
836   PetscFunctionBegin;
837   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
838   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatDestroy_SeqSELL(Mat A)
843 {
844   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
845   PetscErrorCode ierr;
846 
847   PetscFunctionBegin;
848 #if defined(PETSC_USE_LOG)
849   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
850 #endif
851   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
852   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
853   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
854   ierr = PetscFree(a->diag);CHKERRQ(ierr);
855   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
856   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
857   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
858   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
859   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
860   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
861   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
862 
863   ierr = PetscFree(A->data);CHKERRQ(ierr);
864 
865   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
866   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
868 #if defined(PETSC_HAVE_ELEMENTAL)
869 #endif
870   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
871   PetscFunctionReturn(0);
872 }
873 
874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
875 {
876   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
877   PetscErrorCode ierr;
878 
879   PetscFunctionBegin;
880   switch (op) {
881   case MAT_ROW_ORIENTED:
882     a->roworiented = flg;
883     break;
884   case MAT_KEEP_NONZERO_PATTERN:
885     a->keepnonzeropattern = flg;
886     break;
887   case MAT_NEW_NONZERO_LOCATIONS:
888     a->nonew = (flg ? 0 : 1);
889     break;
890   case MAT_NEW_NONZERO_LOCATION_ERR:
891     a->nonew = (flg ? -1 : 0);
892     break;
893   case MAT_NEW_NONZERO_ALLOCATION_ERR:
894     a->nonew = (flg ? -2 : 0);
895     break;
896   case MAT_UNUSED_NONZERO_LOCATION_ERR:
897     a->nounused = (flg ? -1 : 0);
898     break;
899   case MAT_NEW_DIAGONALS:
900   case MAT_IGNORE_OFF_PROC_ENTRIES:
901   case MAT_USE_HASH_TABLE:
902     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
903     break;
904   case MAT_SPD:
905   case MAT_SYMMETRIC:
906   case MAT_STRUCTURALLY_SYMMETRIC:
907   case MAT_HERMITIAN:
908   case MAT_SYMMETRY_ETERNAL:
909     /* These options are handled directly by MatSetOption() */
910     break;
911   default:
912     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
913   }
914   PetscFunctionReturn(0);
915 }
916 
917 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
918 {
919   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
920   PetscInt       i,j,n,shift;
921   PetscScalar    *x,zero=0.0;
922   PetscErrorCode ierr;
923 
924   PetscFunctionBegin;
925   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
926   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
927 
928   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
929     PetscInt *diag=a->diag;
930     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
931     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
932     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
933     PetscFunctionReturn(0);
934   }
935 
936   ierr = VecSet(v,zero);CHKERRQ(ierr);
937   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
938   for (i=0; i<n; i++) { /* loop over rows */
939     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
940     x[i] = 0;
941     for (j=0; j<a->rlen[i]; j++) {
942       if (a->colidx[shift+j*8] == i) {
943         x[i] = a->val[shift+j*8];
944         break;
945       }
946     }
947   }
948   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
949   PetscFunctionReturn(0);
950 }
951 
952 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
953 {
954   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
955   const PetscScalar *l,*r;
956   PetscInt          i,j,m,n,row;
957   PetscErrorCode    ierr;
958 
959   PetscFunctionBegin;
960   if (ll) {
961     /* The local size is used so that VecMPI can be passed to this routine
962        by MatDiagonalScale_MPISELL */
963     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
964     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
965     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
966     for (i=0; i<a->totalslices; i++) { /* loop over slices */
967       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
968         a->val[j] *= l[8*i+row];
969       }
970     }
971     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
972     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
973   }
974   if (rr) {
975     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
976     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
977     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
978     for (i=0; i<a->totalslices; i++) { /* loop over slices */
979       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
980         a->val[j] *= r[a->colidx[j]];
981       }
982     }
983     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
984     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
985   }
986   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
987   PetscFunctionReturn(0);
988 }
989 
990 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
991 
992 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
993 {
994   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
995   PetscInt    *cp,i,k,low,high,t,row,col,l;
996   PetscInt    shift;
997   MatScalar   *vp;
998 
999   PetscFunctionBegin;
1000   for (k=0; k<m; k++) { /* loop over requested rows */
1001     row = im[k];
1002     if (row<0) continue;
1003 #if defined(PETSC_USE_DEBUG)
1004     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1005 #endif
1006     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1007     cp = a->colidx+shift; /* pointer to the row */
1008     vp = a->val+shift; /* pointer to the row */
1009     for (l=0; l<n; l++) { /* loop over requested columns */
1010       col = in[l];
1011       if (col<0) continue;
1012 #if defined(PETSC_USE_DEBUG)
1013       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1014 #endif
1015       high = a->rlen[row]; low = 0; /* assume unsorted */
1016       while (high-low > 5) {
1017         t = (low+high)/2;
1018         if (*(cp+t*8) > col) high = t;
1019         else low = t;
1020       }
1021       for (i=low; i<high; i++) {
1022         if (*(cp+8*i) > col) break;
1023         if (*(cp+8*i) == col) {
1024           *v++ = *(vp+8*i);
1025           goto finished;
1026         }
1027       }
1028       *v++ = 0.0;
1029     finished:;
1030     }
1031   }
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1036 {
1037   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1038   PetscInt          i,j,m=A->rmap->n,shift;
1039   const char        *name;
1040   PetscViewerFormat format;
1041   PetscErrorCode    ierr;
1042 
1043   PetscFunctionBegin;
1044   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1045   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1046     PetscInt nofinalvalue = 0;
1047     /*
1048     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1049       nofinalvalue = 1;
1050     }
1051     */
1052     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1053     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1054     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1055 #if defined(PETSC_USE_COMPLEX)
1056     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1057 #else
1058     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1059 #endif
1060     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1061 
1062     for (i=0; i<m; i++) {
1063       shift = a->sliidx[i>>3]+(i&0x07);
1064       for (j=0; j<a->rlen[i]; j++) {
1065 #if defined(PETSC_USE_COMPLEX)
1066         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1067 #else
1068         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1069 #endif
1070       }
1071     }
1072     /*
1073     if (nofinalvalue) {
1074 #if defined(PETSC_USE_COMPLEX)
1075       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1076 #else
1077       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1078 #endif
1079     }
1080     */
1081     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1082     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1083     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1084   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1085     PetscFunctionReturn(0);
1086   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1087     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1088     for (i=0; i<m; i++) {
1089       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1090       shift = a->sliidx[i>>3]+(i&0x07);
1091       for (j=0; j<a->rlen[i]; j++) {
1092 #if defined(PETSC_USE_COMPLEX)
1093         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1094           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1095         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1096           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1097         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1098           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1099         }
1100 #else
1101         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1102 #endif
1103       }
1104       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1105     }
1106     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1107   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1108     PetscInt    cnt=0,jcnt;
1109     PetscScalar value;
1110 #if defined(PETSC_USE_COMPLEX)
1111     PetscBool   realonly=PETSC_TRUE;
1112     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1113       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1114         realonly = PETSC_FALSE;
1115         break;
1116       }
1117     }
1118 #endif
1119 
1120     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1121     for (i=0; i<m; i++) {
1122       jcnt = 0;
1123       shift = a->sliidx[i>>3]+(i&0x07);
1124       for (j=0; j<A->cmap->n; j++) {
1125         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1126           value = a->val[cnt++];
1127           jcnt++;
1128         } else {
1129           value = 0.0;
1130         }
1131 #if defined(PETSC_USE_COMPLEX)
1132         if (realonly) {
1133           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1134         } else {
1135           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1136         }
1137 #else
1138         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1139 #endif
1140       }
1141       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1142     }
1143     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1144   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1145     PetscInt fshift=1;
1146     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1147 #if defined(PETSC_USE_COMPLEX)
1148     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1149 #else
1150     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1151 #endif
1152     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1153     for (i=0; i<m; i++) {
1154       shift = a->sliidx[i>>3]+(i&0x07);
1155       for (j=0; j<a->rlen[i]; j++) {
1156 #if defined(PETSC_USE_COMPLEX)
1157         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1158 #else
1159         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1160 #endif
1161       }
1162     }
1163     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1164   } else if (format == PETSC_VIEWER_NATIVE) {
1165     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1166       PetscInt row;
1167       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1168       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1169 #if defined(PETSC_USE_COMPLEX)
1170         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1171           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1172         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1173           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1174         } else {
1175           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1176         }
1177 #else
1178         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1179 #endif
1180       }
1181     }
1182   } else {
1183     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1184     if (A->factortype) {
1185       for (i=0; i<m; i++) {
1186         shift = a->sliidx[i>>3]+(i&0x07);
1187         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1188         /* L part */
1189         for (j=shift; j<a->diag[i]; j+=8) {
1190 #if defined(PETSC_USE_COMPLEX)
1191           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1192             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1193           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1194             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1195           } else {
1196             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1197           }
1198 #else
1199           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1200 #endif
1201         }
1202         /* diagonal */
1203         j = a->diag[i];
1204 #if defined(PETSC_USE_COMPLEX)
1205         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1206           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1207         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1208           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1209         } else {
1210           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1211         }
1212 #else
1213         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1214 #endif
1215 
1216         /* U part */
1217         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1218 #if defined(PETSC_USE_COMPLEX)
1219           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1220             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1221           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1222             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1223           } else {
1224             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1225           }
1226 #else
1227           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1228 #endif
1229         }
1230         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1231       }
1232     } else {
1233       for (i=0; i<m; i++) {
1234         shift = a->sliidx[i>>3]+(i&0x07);
1235         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1236         for (j=0; j<a->rlen[i]; j++) {
1237 #if defined(PETSC_USE_COMPLEX)
1238           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1239             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1240           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1241             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1242           } else {
1243             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1244           }
1245 #else
1246           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1247 #endif
1248         }
1249         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1250       }
1251     }
1252     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1253   }
1254   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1255   PetscFunctionReturn(0);
1256 }
1257 
1258 #include <petscdraw.h>
1259 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1260 {
1261   Mat               A=(Mat)Aa;
1262   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1263   PetscInt          i,j,m=A->rmap->n,shift;
1264   int               color;
1265   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1266   PetscViewer       viewer;
1267   PetscViewerFormat format;
1268   PetscErrorCode    ierr;
1269 
1270   PetscFunctionBegin;
1271   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1272   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1273   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1274 
1275   /* loop over matrix elements drawing boxes */
1276 
1277   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1278     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1279     /* Blue for negative, Cyan for zero and  Red for positive */
1280     color = PETSC_DRAW_BLUE;
1281     for (i=0; i<m; i++) {
1282       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1283       y_l = m - i - 1.0; y_r = y_l + 1.0;
1284       for (j=0; j<a->rlen[i]; j++) {
1285         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1286         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1287         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1288       }
1289     }
1290     color = PETSC_DRAW_CYAN;
1291     for (i=0; i<m; i++) {
1292       shift = a->sliidx[i>>3]+(i&0x07);
1293       y_l = m - i - 1.0; y_r = y_l + 1.0;
1294       for (j=0; j<a->rlen[i]; j++) {
1295         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1296         if (a->val[shift+8*j] !=  0.) continue;
1297         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1298       }
1299     }
1300     color = PETSC_DRAW_RED;
1301     for (i=0; i<m; i++) {
1302       shift = a->sliidx[i>>3]+(i&0x07);
1303       y_l = m - i - 1.0; y_r = y_l + 1.0;
1304       for (j=0; j<a->rlen[i]; j++) {
1305         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1306         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1307         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1308       }
1309     }
1310     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1311   } else {
1312     /* use contour shading to indicate magnitude of values */
1313     /* first determine max of all nonzero values */
1314     PetscReal minv=0.0,maxv=0.0;
1315     PetscInt  count=0;
1316     PetscDraw popup;
1317     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1318       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1319     }
1320     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1321     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1322     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1323 
1324     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1325     for (i=0; i<m; i++) {
1326       shift = a->sliidx[i>>3]+(i&0x07);
1327       y_l = m - i - 1.0;
1328       y_r = y_l + 1.0;
1329       for (j=0; j<a->rlen[i]; j++) {
1330         x_l = a->colidx[shift+j*8];
1331         x_r = x_l + 1.0;
1332         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1333         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1334         count++;
1335       }
1336     }
1337     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1338   }
1339   PetscFunctionReturn(0);
1340 }
1341 
1342 #include <petscdraw.h>
1343 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1344 {
1345   PetscDraw      draw;
1346   PetscReal      xr,yr,xl,yl,h,w;
1347   PetscBool      isnull;
1348   PetscErrorCode ierr;
1349 
1350   PetscFunctionBegin;
1351   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1352   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1353   if (isnull) PetscFunctionReturn(0);
1354 
1355   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1356   xr  += w;          yr += h;         xl = -w;     yl = -h;
1357   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1358   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1359   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1360   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1361   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1366 {
1367   PetscBool      iascii,isbinary,isdraw;
1368   PetscErrorCode ierr;
1369 
1370   PetscFunctionBegin;
1371   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1372   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1373   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1374   if (iascii) {
1375     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1376   } else if (isbinary) {
1377     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1378   } else if (isdraw) {
1379     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1380   }
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1385 {
1386   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1387   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1388   MatScalar      *vp;
1389   PetscErrorCode ierr;
1390 
1391   PetscFunctionBegin;
1392   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1393   /* To do: compress out the unused elements */
1394   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1395   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1396   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1397   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1398   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1399   for (i=0; i<a->totalslices; ++i) {
1400     shift = a->sliidx[i];    /* starting index of the slice */
1401     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1402     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1403     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1404       row  = 8*i + row_in_slice;
1405       nrow = a->rlen[row]; /* number of nonzeros in row */
1406       /*
1407         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1408         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1409       */
1410       lastcol = 0;
1411       if (nrow>0) { /* nonempty row */
1412         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1413       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1414         for (j=1;j<8;j++) {
1415           if (a->rlen[8*i+j]) {
1416             lastcol = cp[j];
1417             break;
1418           }
1419         }
1420       } else {
1421         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1422       }
1423 
1424       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1425         cp[8*k+row_in_slice] = lastcol;
1426         vp[8*k+row_in_slice] = (MatScalar)0;
1427       }
1428     }
1429   }
1430 
1431   A->info.mallocs += a->reallocs;
1432   a->reallocs      = 0;
1433 
1434   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1439 {
1440   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1441 
1442   PetscFunctionBegin;
1443   info->block_size   = 1.0;
1444   info->nz_allocated = (double)a->maxallocmat;
1445   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1446   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1447   info->assemblies   = (double)A->num_ass;
1448   info->mallocs      = (double)A->info.mallocs;
1449   info->memory       = ((PetscObject)A)->mem;
1450   if (A->factortype) {
1451     info->fill_ratio_given  = A->info.fill_ratio_given;
1452     info->fill_ratio_needed = A->info.fill_ratio_needed;
1453     info->factor_mallocs    = A->info.factor_mallocs;
1454   } else {
1455     info->fill_ratio_given  = 0;
1456     info->fill_ratio_needed = 0;
1457     info->factor_mallocs    = 0;
1458   }
1459   PetscFunctionReturn(0);
1460 }
1461 
1462 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1463 {
1464   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1465   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1466   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1467   MatScalar      *vp,value;
1468   PetscErrorCode ierr;
1469 
1470   PetscFunctionBegin;
1471   for (k=0; k<m; k++) { /* loop over added rows */
1472     row = im[k];
1473     if (row < 0) continue;
1474 #if defined(PETSC_USE_DEBUG)
1475     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1476 #endif
1477     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1478     cp    = a->colidx+shift; /* pointer to the row */
1479     vp    = a->val+shift; /* pointer to the row */
1480     nrow  = a->rlen[row];
1481     low   = 0;
1482     high  = nrow;
1483 
1484     for (l=0; l<n; l++) { /* loop over added columns */
1485       col = in[l];
1486       if (col<0) continue;
1487 #if defined(PETSC_USE_DEBUG)
1488       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1489 #endif
1490       if (a->roworiented) {
1491         value = v[l+k*n];
1492       } else {
1493         value = v[k+l*m];
1494       }
1495       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1496 
1497       /* search in this row for the specified colmun, i indicates the column to be set */
1498       if (col <= lastcol) low = 0;
1499       else high = nrow;
1500       lastcol = col;
1501       while (high-low > 5) {
1502         t = (low+high)/2;
1503         if (*(cp+t*8) > col) high = t;
1504         else low = t;
1505       }
1506       for (i=low; i<high; i++) {
1507         if (*(cp+i*8) > col) break;
1508         if (*(cp+i*8) == col) {
1509           if (is == ADD_VALUES) *(vp+i*8) += value;
1510           else *(vp+i*8) = value;
1511           low = i + 1;
1512           goto noinsert;
1513         }
1514       }
1515       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1516       if (nonew == 1) goto noinsert;
1517       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1518       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1519       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1520       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1521       for (ii=nrow-1; ii>=i; ii--) {
1522         *(cp+(ii+1)*8) = *(cp+ii*8);
1523         *(vp+(ii+1)*8) = *(vp+ii*8);
1524       }
1525       a->rlen[row]++;
1526       *(cp+i*8) = col;
1527       *(vp+i*8) = value;
1528       a->nz++;
1529       A->nonzerostate++;
1530       low = i+1; high++; nrow++;
1531 noinsert:;
1532     }
1533     a->rlen[row] = nrow;
1534   }
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1539 {
1540   PetscErrorCode ierr;
1541 
1542   PetscFunctionBegin;
1543   /* If the two matrices have the same copy implementation, use fast copy. */
1544   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1545     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1546     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1547 
1548     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1549     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1550   } else {
1551     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1552   }
1553   PetscFunctionReturn(0);
1554 }
1555 
1556 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1557 {
1558   PetscErrorCode ierr;
1559 
1560   PetscFunctionBegin;
1561   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1566 {
1567   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1568 
1569   PetscFunctionBegin;
1570   *array = a->val;
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1575 {
1576   PetscFunctionBegin;
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1581 {
1582   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1583   PetscInt    i;
1584   MatScalar   *aval=a->val;
1585 
1586   PetscFunctionBegin;
1587   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1592 {
1593   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1594   PetscInt       i;
1595   MatScalar      *aval=a->val;
1596   PetscErrorCode ierr;
1597 
1598   PetscFunctionBegin;
1599   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1600   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1605 {
1606   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1607   MatScalar      *aval=a->val;
1608   PetscScalar    oalpha=alpha;
1609   PetscBLASInt   one=1,size;
1610   PetscErrorCode ierr;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1614   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1615   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1616   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1617   PetscFunctionReturn(0);
1618 }
1619 
1620 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1621 {
1622   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1623   PetscErrorCode ierr;
1624 
1625   PetscFunctionBegin;
1626   if (!Y->preallocated || !y->nz) {
1627     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1628   }
1629   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1634 {
1635   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1636   PetscScalar       *x,sum,*t;
1637   const MatScalar   *idiag=0,*mdiag;
1638   const PetscScalar *b,*xb;
1639   PetscInt          n,m=A->rmap->n,i,j,shift;
1640   const PetscInt    *diag;
1641   PetscErrorCode    ierr;
1642 
1643   PetscFunctionBegin;
1644   its = its*lits;
1645 
1646   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1647   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1648   a->fshift = fshift;
1649   a->omega  = omega;
1650 
1651   diag  = a->diag;
1652   t     = a->ssor_work;
1653   idiag = a->idiag;
1654   mdiag = a->mdiag;
1655 
1656   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1657   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1658   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1659   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1660   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1661   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1662 
1663   if (flag & SOR_ZERO_INITIAL_GUESS) {
1664     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1665       for (i=0; i<m; i++) {
1666         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1667         sum   = b[i];
1668         n     = (diag[i]-shift)/8;
1669         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1670         t[i]  = sum;
1671         x[i]  = sum*idiag[i];
1672       }
1673       xb   = t;
1674       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1675     } else xb = b;
1676     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1677       for (i=m-1; i>=0; i--) {
1678         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1679         sum   = xb[i];
1680         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1681         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1682         if (xb == b) {
1683           x[i] = sum*idiag[i];
1684         } else {
1685           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1686         }
1687       }
1688       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1689     }
1690     its--;
1691   }
1692   while (its--) {
1693     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1694       for (i=0; i<m; i++) {
1695         /* lower */
1696         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1697         sum   = b[i];
1698         n     = (diag[i]-shift)/8;
1699         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1700         t[i]  = sum;             /* save application of the lower-triangular part */
1701         /* upper */
1702         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1703         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1704         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1705       }
1706       xb   = t;
1707       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1708     } else xb = b;
1709     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1710       for (i=m-1; i>=0; i--) {
1711         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1712         sum = xb[i];
1713         if (xb == b) {
1714           /* whole matrix (no checkpointing available) */
1715           n     = a->rlen[i];
1716           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1717           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1718         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1719           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1720           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1721           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1722         }
1723       }
1724       if (xb == b) {
1725         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1726       } else {
1727         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1728       }
1729     }
1730   }
1731   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1732   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 /* -------------------------------------------------------------------*/
1737 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1738                                        MatGetRow_SeqSELL,
1739                                        MatRestoreRow_SeqSELL,
1740                                        MatMult_SeqSELL,
1741                                /* 4*/  MatMultAdd_SeqSELL,
1742                                        MatMultTranspose_SeqSELL,
1743                                        MatMultTransposeAdd_SeqSELL,
1744                                        0,
1745                                        0,
1746                                        0,
1747                                /* 10*/ 0,
1748                                        0,
1749                                        0,
1750                                        MatSOR_SeqSELL,
1751                                        0,
1752                                /* 15*/ MatGetInfo_SeqSELL,
1753                                        MatEqual_SeqSELL,
1754                                        MatGetDiagonal_SeqSELL,
1755                                        MatDiagonalScale_SeqSELL,
1756                                        0,
1757                                /* 20*/ 0,
1758                                        MatAssemblyEnd_SeqSELL,
1759                                        MatSetOption_SeqSELL,
1760                                        MatZeroEntries_SeqSELL,
1761                                /* 24*/ 0,
1762                                        0,
1763                                        0,
1764                                        0,
1765                                        0,
1766                                /* 29*/ MatSetUp_SeqSELL,
1767                                        0,
1768                                        0,
1769                                        0,
1770                                        0,
1771                                /* 34*/ MatDuplicate_SeqSELL,
1772                                        0,
1773                                        0,
1774                                        0,
1775                                        0,
1776                                /* 39*/ 0,
1777                                        0,
1778                                        0,
1779                                        MatGetValues_SeqSELL,
1780                                        MatCopy_SeqSELL,
1781                                /* 44*/ 0,
1782                                        MatScale_SeqSELL,
1783                                        MatShift_SeqSELL,
1784                                        0,
1785                                        0,
1786                                /* 49*/ 0,
1787                                        0,
1788                                        0,
1789                                        0,
1790                                        0,
1791                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1792                                        0,
1793                                        0,
1794                                        0,
1795                                        0,
1796                                /* 59*/ 0,
1797                                        MatDestroy_SeqSELL,
1798                                        MatView_SeqSELL,
1799                                        0,
1800                                        0,
1801                                /* 64*/ 0,
1802                                        0,
1803                                        0,
1804                                        0,
1805                                        0,
1806                                /* 69*/ 0,
1807                                        0,
1808                                        0,
1809                                        0,
1810                                        0,
1811                                /* 74*/ 0,
1812                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1813                                        0,
1814                                        0,
1815                                        0,
1816                                /* 79*/ 0,
1817                                        0,
1818                                        0,
1819                                        0,
1820                                        0,
1821                                /* 84*/ 0,
1822                                        0,
1823                                        0,
1824                                        0,
1825                                        0,
1826                                /* 89*/ 0,
1827                                        0,
1828                                        0,
1829                                        0,
1830                                        0,
1831                                /* 94*/ 0,
1832                                        0,
1833                                        0,
1834                                        0,
1835                                        0,
1836                                /* 99*/ 0,
1837                                        0,
1838                                        0,
1839                                        MatConjugate_SeqSELL,
1840                                        0,
1841                                /*104*/ 0,
1842                                        0,
1843                                        0,
1844                                        0,
1845                                        0,
1846                                /*109*/ 0,
1847                                        0,
1848                                        0,
1849                                        0,
1850                                        MatMissingDiagonal_SeqSELL,
1851                                /*114*/ 0,
1852                                        0,
1853                                        0,
1854                                        0,
1855                                        0,
1856                                /*119*/ 0,
1857                                        0,
1858                                        0,
1859                                        0,
1860                                        0,
1861                                /*124*/ 0,
1862                                        0,
1863                                        0,
1864                                        0,
1865                                        0,
1866                                /*129*/ 0,
1867                                        0,
1868                                        0,
1869                                        0,
1870                                        0,
1871                                /*134*/ 0,
1872                                        0,
1873                                        0,
1874                                        0,
1875                                        0,
1876                                /*139*/ 0,
1877                                        0,
1878                                        0,
1879                                        MatFDColoringSetUp_SeqXAIJ,
1880                                        0,
1881                                 /*144*/0
1882 };
1883 
1884 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1885 {
1886   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1887   PetscErrorCode ierr;
1888 
1889   PetscFunctionBegin;
1890   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1891 
1892   /* allocate space for values if not already there */
1893   if (!a->saved_values) {
1894     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1895     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1896   }
1897 
1898   /* copy values over */
1899   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1900   PetscFunctionReturn(0);
1901 }
1902 
1903 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1904 {
1905   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1906   PetscErrorCode ierr;
1907 
1908   PetscFunctionBegin;
1909   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1910   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1911   /* copy values over */
1912   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 /*@C
1917  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1918 
1919  Not Collective
1920 
1921  Input Parameters:
1922  .  mat - a MATSEQSELL matrix
1923  .  array - pointer to the data
1924 
1925  Level: intermediate
1926 
1927  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1928  @*/
1929 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1930 {
1931   PetscErrorCode ierr;
1932 
1933   PetscFunctionBegin;
1934   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1939 {
1940   Mat_SeqSELL    *b;
1941   PetscMPIInt    size;
1942   PetscErrorCode ierr;
1943 
1944   PetscFunctionBegin;
1945   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1946   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1947 
1948   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1949 
1950   B->data = (void*)b;
1951 
1952   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1953 
1954   b->row                = 0;
1955   b->col                = 0;
1956   b->icol               = 0;
1957   b->reallocs           = 0;
1958   b->ignorezeroentries  = PETSC_FALSE;
1959   b->roworiented        = PETSC_TRUE;
1960   b->nonew              = 0;
1961   b->diag               = 0;
1962   b->solve_work         = 0;
1963   B->spptr              = 0;
1964   b->saved_values       = 0;
1965   b->idiag              = 0;
1966   b->mdiag              = 0;
1967   b->ssor_work          = 0;
1968   b->omega              = 1.0;
1969   b->fshift             = 0.0;
1970   b->idiagvalid         = PETSC_FALSE;
1971   b->keepnonzeropattern = PETSC_FALSE;
1972 
1973   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1974   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1977   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1978   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1979   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1980   PetscFunctionReturn(0);
1981 }
1982 
1983 /*
1984  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1985  */
1986 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1987 {
1988   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1989   PetscInt       i,m=A->rmap->n;
1990   PetscInt       totalslices=a->totalslices;
1991   PetscErrorCode ierr;
1992 
1993   PetscFunctionBegin;
1994   c = (Mat_SeqSELL*)C->data;
1995 
1996   C->factortype = A->factortype;
1997   c->row        = 0;
1998   c->col        = 0;
1999   c->icol       = 0;
2000   c->reallocs   = 0;
2001 
2002   C->assembled = PETSC_TRUE;
2003 
2004   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2005   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2006 
2007   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2008   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2009   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2010   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2011 
2012   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2013   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2014 
2015   /* allocate the matrix space */
2016   if (mallocmatspace) {
2017     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2018     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2019 
2020     c->singlemalloc = PETSC_TRUE;
2021 
2022     if (m > 0) {
2023       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2024       if (cpvalues == MAT_COPY_VALUES) {
2025         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2026       } else {
2027         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2028       }
2029     }
2030   }
2031 
2032   c->ignorezeroentries = a->ignorezeroentries;
2033   c->roworiented       = a->roworiented;
2034   c->nonew             = a->nonew;
2035   if (a->diag) {
2036     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2037     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2038     for (i=0; i<m; i++) {
2039       c->diag[i] = a->diag[i];
2040     }
2041   } else c->diag = 0;
2042 
2043   c->solve_work         = 0;
2044   c->saved_values       = 0;
2045   c->idiag              = 0;
2046   c->ssor_work          = 0;
2047   c->keepnonzeropattern = a->keepnonzeropattern;
2048   c->free_val           = PETSC_TRUE;
2049   c->free_colidx        = PETSC_TRUE;
2050 
2051   c->maxallocmat  = a->maxallocmat;
2052   c->maxallocrow  = a->maxallocrow;
2053   c->rlenmax      = a->rlenmax;
2054   c->nz           = a->nz;
2055   C->preallocated = PETSC_TRUE;
2056 
2057   c->nonzerorowcnt = a->nonzerorowcnt;
2058   C->nonzerostate  = A->nonzerostate;
2059 
2060   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2061   PetscFunctionReturn(0);
2062 }
2063 
2064 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2065 {
2066   PetscErrorCode ierr;
2067 
2068   PetscFunctionBegin;
2069   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2070   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2071   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2072     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2073   }
2074   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2075   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2076   PetscFunctionReturn(0);
2077 }
2078 
2079 /*@C
2080  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2081 
2082  Collective on MPI_Comm
2083 
2084  Input Parameters:
2085  +  comm - MPI communicator, set to PETSC_COMM_SELF
2086  .  m - number of rows
2087  .  n - number of columns
2088  .  rlenmax - maximum number of nonzeros in a row
2089  -  rlen - array containing the number of nonzeros in the various rows
2090  (possibly different for each row) or NULL
2091 
2092  Output Parameter:
2093  .  A - the matrix
2094 
2095  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2096  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2097  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2098 
2099  Notes:
2100  If nnz is given then nz is ignored
2101 
2102  Specify the preallocated storage with either rlenmax or rlen (not both).
2103  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2104  allocation.  For large problems you MUST preallocate memory or you
2105  will get TERRIBLE performance, see the users' manual chapter on matrices.
2106 
2107  Level: intermediate
2108 
2109  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2110 
2111  @*/
2112 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2113 {
2114   PetscErrorCode ierr;
2115 
2116   PetscFunctionBegin;
2117   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2118   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2119   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2120   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2121   PetscFunctionReturn(0);
2122 }
2123 
2124 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2125 {
2126   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2127   PetscInt       totalslices=a->totalslices;
2128   PetscErrorCode ierr;
2129 
2130   PetscFunctionBegin;
2131   /* If the  matrix dimensions are not equal,or no of nonzeros */
2132   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2133     *flg = PETSC_FALSE;
2134     PetscFunctionReturn(0);
2135   }
2136   /* if the a->colidx are the same */
2137   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2138   if (!*flg) PetscFunctionReturn(0);
2139   /* if a->val are the same */
2140   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2141   PetscFunctionReturn(0);
2142 }
2143 
2144 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2145 {
2146   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2147 
2148   PetscFunctionBegin;
2149   a->idiagvalid  = PETSC_FALSE;
2150   PetscFunctionReturn(0);
2151 }
2152 
2153 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2154 {
2155 #if defined(PETSC_USE_COMPLEX)
2156   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2157   PetscInt    i;
2158   PetscScalar *val = a->val;
2159 
2160   PetscFunctionBegin;
2161   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2162     val[i] = PetscConj(val[i]);
2163   }
2164 #else
2165   PetscFunctionBegin;
2166 #endif
2167   PetscFunctionReturn(0);
2168 }
2169