xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision bfcb38ea38335faa6e7f8d97f6bc6ce9aa2a1dd1)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;  /* number of dimensions and dimension sizes in FFTW's ptrdiff_t type */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;              /* dimension descriptors for the FFTW guru64 planner interface */
#else
  fftw_iodim   *iodims;              /* dimension descriptors for the FFTW guru planner interface */
#endif
  PetscInt     partial_dim;          /* partial product of the dimensions -- TODO confirm exact meaning against MatCreate_FFTW */
  fftw_plan    p_forward,p_backward; /* cached plans, created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
                                                             executed for the arrays with which the plan was created */
} Mat_FFTW;
25 
/* Forward declarations of the MATFFTW operations implemented in this file */
extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatDestroy_FFTW(Mat);
extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
33 
/* MatMult_SeqFFTW performs the sequential (single-process) forward DFT
   Input parameters:
     A - the matrix
     x - the vector on which FDFT will be performed

   Output parameter:
     y - vector that stores result of FDFT
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      /* build row-major stride descriptors: the last dimension is contiguous,
         each preceding dimension strides by the product of the faster ones */
      if (ndim) {
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for; plan reuse below depends on this */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
135 
136 /* MatMultTranspose_SeqFFTW performs serial backward DFT
137    Input parameter:
138      A - the matrix
139      x - the vector on which BDFT will be performed
140 
141    Output parameter:
142      y - vector that stores result of BDFT
143 */
144 
145 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
146 {
147   PetscErrorCode ierr;
148   Mat_FFT        *fft  = (Mat_FFT*)A->data;
149   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
150   const PetscScalar *x_array;
151   PetscScalar    *y_array;
152   PetscInt       ndim=fft->ndim,*dim=fft->dim;
153 #if defined(PETSC_USE_COMPLEX)
154 #if defined(PETSC_USE_64BIT_INDICES)
155   fftw_iodim64   *iodims=fftw->iodims;
156 #else
157   fftw_iodim     *iodims=fftw->iodims;
158 #endif
159 #endif
160 
161   PetscFunctionBegin;
162   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
163   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
164   if (!fftw->p_backward) { /* create a plan, then excute it */
165     switch (ndim) {
166     case 1:
167 #if defined(PETSC_USE_COMPLEX)
168       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
169 #else
170       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
171 #endif
172       break;
173     case 2:
174 #if defined(PETSC_USE_COMPLEX)
175       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
176 #else
177       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
178 #endif
179       break;
180     case 3:
181 #if defined(PETSC_USE_COMPLEX)
182       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
183 #else
184       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
185 #endif
186       break;
187     default:
188 #if defined(PETSC_USE_COMPLEX)
189 #if defined(PETSC_USE_64BIT_INDICES)
190       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
191 #else
192       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
193 #endif
194 #else
195       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
196 #endif
197       break;
198     }
199     fftw->binarray  = (PetscScalar *) x_array;
200     fftw->boutarray = y_array;
201     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
202   } else { /* use existing plan */
203     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
204 #if defined(PETSC_USE_COMPLEX)
205       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
206 #else
207       fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
208 #endif
209     } else {
210       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
211     }
212   }
213   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
214   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 /* MatMult_MPIFFTW performs forward DFT in parallel
219    Input parameter:
220      A - the matrix
221      x - the vector on which FDFT will be performed
222 
223    Output parameter:
224    y   - vector that stores result of FDFT
225 */
226 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
227 {
228   PetscErrorCode ierr;
229   Mat_FFT        *fft  = (Mat_FFT*)A->data;
230   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
231   const PetscScalar *x_array;
232   PetscScalar    *y_array;
233   PetscInt       ndim=fft->ndim,*dim=fft->dim;
234   MPI_Comm       comm;
235 
236   PetscFunctionBegin;
237   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
238   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
239   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
240   if (!fftw->p_forward) { /* create a plan, then excute it */
241     switch (ndim) {
242     case 1:
243 #if defined(PETSC_USE_COMPLEX)
244       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
245 #else
246       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
247 #endif
248       break;
249     case 2:
250 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
251       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
252 #else
253       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
254 #endif
255       break;
256     case 3:
257 #if defined(PETSC_USE_COMPLEX)
258       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
259 #else
260       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
261 #endif
262       break;
263     default:
264 #if defined(PETSC_USE_COMPLEX)
265       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
266 #else
267       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
268 #endif
269       break;
270     }
271     fftw->finarray  = (PetscScalar *) x_array;
272     fftw->foutarray = y_array;
273     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
274                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
275     fftw_execute(fftw->p_forward);
276   } else { /* use existing plan */
277     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
278       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
279     } else {
280       fftw_execute(fftw->p_forward);
281     }
282   }
283   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
284   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
285   PetscFunctionReturn(0);
286 }
287 
288 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
289    Input parameter:
290      A - the matrix
291      x - the vector on which BDFT will be performed
292 
293    Output parameter:
294      y - vector that stores result of BDFT
295 */
296 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
297 {
298   PetscErrorCode ierr;
299   Mat_FFT        *fft  = (Mat_FFT*)A->data;
300   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
301   const PetscScalar *x_array;
302   PetscScalar    *y_array;
303   PetscInt       ndim=fft->ndim,*dim=fft->dim;
304   MPI_Comm       comm;
305 
306   PetscFunctionBegin;
307   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
308   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
309   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
310   if (!fftw->p_backward) { /* create a plan, then excute it */
311     switch (ndim) {
312     case 1:
313 #if defined(PETSC_USE_COMPLEX)
314       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
315 #else
316       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
317 #endif
318       break;
319     case 2:
320 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
321       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
322 #else
323       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
324 #endif
325       break;
326     case 3:
327 #if defined(PETSC_USE_COMPLEX)
328       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
329 #else
330       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
331 #endif
332       break;
333     default:
334 #if defined(PETSC_USE_COMPLEX)
335       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
336 #else
337       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
338 #endif
339       break;
340     }
341     fftw->binarray  = (PetscScalar *) x_array;
342     fftw->boutarray = y_array;
343     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
344   } else { /* use existing plan */
345     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
346       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
347     } else {
348       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
349     }
350   }
351   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
352   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
353   PetscFunctionReturn(0);
354 }
355 
356 PetscErrorCode MatDestroy_FFTW(Mat A)
357 {
358   Mat_FFT        *fft  = (Mat_FFT*)A->data;
359   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
360   PetscErrorCode ierr;
361 
362   PetscFunctionBegin;
363   fftw_destroy_plan(fftw->p_forward);
364   fftw_destroy_plan(fftw->p_backward);
365   if (fftw->iodims) {
366     free(fftw->iodims);
367   }
368   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
369   ierr = PetscFree(fft->data);CHKERRQ(ierr);
370   fftw_mpi_cleanup();
371   PetscFunctionReturn(0);
372 }
373 
374 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
375 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
376 {
377   PetscErrorCode ierr;
378   PetscScalar    *array;
379 
380   PetscFunctionBegin;
381   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
382   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
383   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
384   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
385   PetscFunctionReturn(0);
386 }
387 
388 
389 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
390 {
391   PetscErrorCode ierr;
392   Mat            A;
393 
394   PetscFunctionBegin;
395   ierr = PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
396   ierr = MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
401 {
402   PetscErrorCode ierr;
403   Mat            A;
404 
405   PetscFunctionBegin;
406   ierr = PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
407   ierr = MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);CHKERRQ(ierr);
408   PetscFunctionReturn(0);
409 }
410 
411 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
412 {
413   PetscErrorCode ierr;
414   Mat            A;
415 
416   PetscFunctionBegin;
417   ierr = PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
418   ierr = MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);CHKERRQ(ierr);
419   PetscFunctionReturn(0);
420 }
421 
422 
/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by FFTW

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Note: The parallel layout of the output of forward FFTW is always the same as the input
        of the backward FFTW. But the parallel layout of the input vector of forward
        FFTW might not be the same as the output of backward FFTW.
        Also note that we need to provide enough space while doing a parallel real transform.
        We need to pad extra zeros at the end of the last dimension. For this reason one needs to
        invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd: if the last dimension is even one needs to pad two
        zeros, if it is odd only one zero is needed.
        Lastly one needs some scratch space at the end of the data set in each process. alloc_local
        figures out how much space is needed, i.e. it figures out the data+scratch space for
        each processor and returns that.

.seealso: MatCreateFFTW()
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
462 
463 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
464 {
465   PetscErrorCode ierr;
466   PetscMPIInt    size,rank;
467   MPI_Comm       comm;
468   Mat_FFT        *fft  = (Mat_FFT*)A->data;
469   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
470   PetscInt       N     = fft->N;
471   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
472 
473   PetscFunctionBegin;
474   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
475   PetscValidType(A,1);
476   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
477 
478   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
479   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
480   if (size == 1) { /* sequential case */
481 #if defined(PETSC_USE_COMPLEX)
482     if (fin)  {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
483     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
484     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
485 #else
486     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
487     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
488     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
489 #endif
490   } else { /* parallel cases */
491     ptrdiff_t    alloc_local,local_n0,local_0_start;
492     ptrdiff_t    local_n1;
493     fftw_complex *data_fout;
494     ptrdiff_t    local_1_start;
495 #if defined(PETSC_USE_COMPLEX)
496     fftw_complex *data_fin,*data_bout;
497 #else
498     double    *data_finr,*data_boutr;
499     PetscInt  n1,N1;
500     ptrdiff_t temp;
501 #endif
502 
503     switch (ndim) {
504     case 1:
505 #if !defined(PETSC_USE_COMPLEX)
506       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
507 #else
508       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
509       if (fin) {
510         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
511         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
512         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
513         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
514         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
515       }
516       if (fout) {
517         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
518         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
519         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
520         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
521         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
522       }
523       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
524       if (bout) {
525         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
526         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
527         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
528         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
529         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
530       }
531       break;
532 #endif
533     case 2:
534 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
535       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
536       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
537       if (fin) {
538         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
539         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
540         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
541         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
542         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
543       }
544       if (fout) {
545         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
546         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
547         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
548         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
549         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
550       }
551       if (bout) {
552         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
554         ierr       = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
555         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
556         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
557       }
558 #else
559       /* Get local size */
560       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
561       if (fin) {
562         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
563         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
564         ierr     = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
565         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
566         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
567       }
568       if (fout) {
569         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
570         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
571         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
572         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
573         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
574       }
575       if (bout) {
576         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
577         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
578         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
579         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
580         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
581       }
582 #endif
583       break;
584     case 3:
585 #if !defined(PETSC_USE_COMPLEX)
586       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
587       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
588       if (fin) {
589         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
590         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
591         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
592         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
593         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
594       }
595       if (fout) {
596         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
597         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
598         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
599         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
600         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
601       }
602       if (bout) {
603         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
604         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
605         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
606         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
607         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
608       }
609 #else
610       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
611       if (fin) {
612         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
613         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
614         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
615         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
616         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
617       }
618       if (fout) {
619         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
620         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
621         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
622         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
623         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
624       }
625       if (bout) {
626         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
627         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
628         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
629         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
630         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
631       }
632 #endif
633       break;
634     default:
635 #if !defined(PETSC_USE_COMPLEX)
636       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
637 
638       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
639 
640       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
641       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
642 
643       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
644 
645       if (fin) {
646         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
647         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
648         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
649         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
650         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
651       }
652       if (fout) {
653         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
654         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
655         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
656         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
657         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
658       }
659       if (bout) {
660         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
661         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
662         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
663         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
664         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
665       }
666 #else
667       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
668       if (fin) {
669         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
670         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
671         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
672         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
673         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
674       }
675       if (fout) {
676         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
677         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
678         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
679         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
680         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
681       }
682       if (bout) {
683         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
684         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
685         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
686         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
687         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
688       }
689 #endif
690       break;
691     }
692   }
693   PetscFunctionReturn(0);
694 }
695 
696 /*@
697    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
698 
699    Collective on Mat
700 
701    Input Parameters:
702 +  A - FFTW matrix
703 -  x - the PETSc vector
704 
705    Output Parameters:
706 .  y - the FFTW vector
707 
708   Options Database Keys:
709 . -mat_fftw_plannerflags - set FFTW planner flags
710 
711    Level: intermediate
712 
   Note: For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job via a scattering operation.
716 
717 .seealso: VecScatterFFTWToPetsc()
718 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW for MATFFTW); errors if no such method is composed */
  ierr = PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
727 
728 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
729 {
730   PetscErrorCode ierr;
731   MPI_Comm       comm;
732   Mat_FFT        *fft  = (Mat_FFT*)A->data;
733   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
734   PetscInt       N     =fft->N;
735   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
736   PetscInt       low;
737   PetscMPIInt    rank,size;
738   PetscInt       vsize,vsize1;
739   ptrdiff_t      local_n0,local_0_start;
740   ptrdiff_t      local_n1,local_1_start;
741   VecScatter     vecscat;
742   IS             list1,list2;
743 #if !defined(PETSC_USE_COMPLEX)
744   PetscInt       i,j,k,partial_dim;
745   PetscInt       *indx1, *indx2, tempindx, tempindx1;
746   PetscInt       NM;
747   ptrdiff_t      temp;
748 #endif
749 
750   PetscFunctionBegin;
751   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
752   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
753   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
754   ierr = VecGetOwnershipRange(y,&low,NULL);
755 
756   if (size==1) {
757     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
758     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
759     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
760     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
761     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
762     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
763     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
764     ierr = ISDestroy(&list1);CHKERRQ(ierr);
765   } else {
766     switch (ndim) {
767     case 1:
768 #if defined(PETSC_USE_COMPLEX)
769       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
770 
771       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
772       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
773       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
774       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
775       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
776       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
777       ierr = ISDestroy(&list1);CHKERRQ(ierr);
778       ierr = ISDestroy(&list2);CHKERRQ(ierr);
779 #else
780       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
781 #endif
782       break;
783     case 2:
784 #if defined(PETSC_USE_COMPLEX)
785       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
786 
787       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
788       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
789       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
790       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
791       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
792       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
793       ierr = ISDestroy(&list1);CHKERRQ(ierr);
794       ierr = ISDestroy(&list2);CHKERRQ(ierr);
795 #else
796       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
797 
798       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
799       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
800 
801       if (dim[1]%2==0) {
802         NM = dim[1]+2;
803       } else {
804         NM = dim[1]+1;
805       }
806       for (i=0; i<local_n0; i++) {
807         for (j=0; j<dim[1]; j++) {
808           tempindx  = i*dim[1] + j;
809           tempindx1 = i*NM + j;
810 
811           indx1[tempindx]=local_0_start*dim[1]+tempindx;
812           indx2[tempindx]=low+tempindx1;
813         }
814       }
815 
816       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
817       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
818 
819       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
820       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
821       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
822       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
823       ierr = ISDestroy(&list1);CHKERRQ(ierr);
824       ierr = ISDestroy(&list2);CHKERRQ(ierr);
825       ierr = PetscFree(indx1);CHKERRQ(ierr);
826       ierr = PetscFree(indx2);CHKERRQ(ierr);
827 #endif
828       break;
829 
830     case 3:
831 #if defined(PETSC_USE_COMPLEX)
832       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
833 
834       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
835       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
836       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
837       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
838       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
839       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
840       ierr = ISDestroy(&list1);CHKERRQ(ierr);
841       ierr = ISDestroy(&list2);CHKERRQ(ierr);
842 #else
843       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
844       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
845       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
846 
847       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
848       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
849 
850       if (dim[2]%2==0) NM = dim[2]+2;
851       else             NM = dim[2]+1;
852 
853       for (i=0; i<local_n0; i++) {
854         for (j=0; j<dim[1]; j++) {
855           for (k=0; k<dim[2]; k++) {
856             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
857             tempindx1 = i*dim[1]*NM + j*NM + k;
858 
859             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
860             indx2[tempindx]=low+tempindx1;
861           }
862         }
863       }
864 
865       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
866       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
867       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
868       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
869       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
870       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
871       ierr = ISDestroy(&list1);CHKERRQ(ierr);
872       ierr = ISDestroy(&list2);CHKERRQ(ierr);
873       ierr = PetscFree(indx1);CHKERRQ(ierr);
874       ierr = PetscFree(indx2);CHKERRQ(ierr);
875 #endif
876       break;
877 
878     default:
879 #if defined(PETSC_USE_COMPLEX)
880       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
881 
882       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
883       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
884       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
885       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
886       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
887       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
888       ierr = ISDestroy(&list1);CHKERRQ(ierr);
889       ierr = ISDestroy(&list2);CHKERRQ(ierr);
890 #else
891       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
892       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
893       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
894 
895       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
896 
897       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
898 
899       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
900 
901       partial_dim = fftw->partial_dim;
902 
903       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
904       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
905 
906       if (dim[ndim-1]%2==0) NM = 2;
907       else                  NM = 1;
908 
909       j = low;
910       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
911         indx1[i] = local_0_start*partial_dim + i;
912         indx2[i] = j;
913         if (k%dim[ndim-1]==0) j+=NM;
914         j++;
915       }
916       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
917       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
918       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
919       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
920       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
921       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
922       ierr = ISDestroy(&list1);CHKERRQ(ierr);
923       ierr = ISDestroy(&list2);CHKERRQ(ierr);
924       ierr = PetscFree(indx1);CHKERRQ(ierr);
925       ierr = PetscFree(indx2);CHKERRQ(ierr);
926 #endif
927       break;
928     }
929   }
930   PetscFunctionReturn(0);
931 }
932 
933 /*@
934    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
935 
936    Collective on Mat
937 
938     Input Parameters:
939 +   A - FFTW matrix
940 -   x - FFTW vector
941 
942    Output Parameters:
943 .  y - PETSc vector
944 
945    Level: intermediate
946 
947    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
948          VecScatterFFTWToPetsc removes those extra zeros.
949 
950 .seealso: VecScatterPetscToFFTW()
951 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW); errors if no such method is composed */
  ierr = PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
960 
961 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
962 {
963   PetscErrorCode ierr;
964   MPI_Comm       comm;
965   Mat_FFT        *fft  = (Mat_FFT*)A->data;
966   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
967   PetscInt       N     = fft->N;
968   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
969   PetscInt       low;
970   PetscMPIInt    rank,size;
971   ptrdiff_t      local_n0,local_0_start;
972   ptrdiff_t      local_n1,local_1_start;
973   VecScatter     vecscat;
974   IS             list1,list2;
975 #if !defined(PETSC_USE_COMPLEX)
976   PetscInt       i,j,k,partial_dim;
977   PetscInt       *indx1, *indx2, tempindx, tempindx1;
978   PetscInt       NM;
979   ptrdiff_t      temp;
980 #endif
981 
982   PetscFunctionBegin;
983   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
984   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
985   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
986   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
987 
988   if (size==1) {
989     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
990     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
991     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
992     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
993     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
994     ierr = ISDestroy(&list1);CHKERRQ(ierr);
995 
996   } else {
997     switch (ndim) {
998     case 1:
999 #if defined(PETSC_USE_COMPLEX)
1000       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1001 
1002       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
1003       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
1004       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1005       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1006       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1007       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1008       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1009       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1010 #else
1011       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1012 #endif
1013       break;
1014     case 2:
1015 #if defined(PETSC_USE_COMPLEX)
1016       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1017 
1018       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
1019       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
1020       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1021       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1022       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1023       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1024       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1025       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1026 #else
1027       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1028 
1029       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
1030       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
1031 
1032       if (dim[1]%2==0) NM = dim[1]+2;
1033       else             NM = dim[1]+1;
1034 
1035       for (i=0; i<local_n0; i++) {
1036         for (j=0; j<dim[1]; j++) {
1037           tempindx = i*dim[1] + j;
1038           tempindx1 = i*NM + j;
1039 
1040           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1041           indx2[tempindx]=low+tempindx1;
1042         }
1043       }
1044 
1045       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1046       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1047 
1048       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1049       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1050       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1051       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1052       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1053       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1054       ierr = PetscFree(indx1);CHKERRQ(ierr);
1055       ierr = PetscFree(indx2);CHKERRQ(ierr);
1056 #endif
1057       break;
1058     case 3:
1059 #if defined(PETSC_USE_COMPLEX)
1060       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1061 
1062       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1063       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1064       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1065       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1066       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1067       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1068       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1069       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1070 #else
1071       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1072 
1073       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1074       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
1075 
1076       if (dim[2]%2==0) NM = dim[2]+2;
1077       else             NM = dim[2]+1;
1078 
1079       for (i=0; i<local_n0; i++) {
1080         for (j=0; j<dim[1]; j++) {
1081           for (k=0; k<dim[2]; k++) {
1082             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1083             tempindx1 = i*dim[1]*NM + j*NM + k;
1084 
1085             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1086             indx2[tempindx]=low+tempindx1;
1087           }
1088         }
1089       }
1090 
1091       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1092       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1093 
1094       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1095       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1096       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1097       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1098       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1099       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1100       ierr = PetscFree(indx1);CHKERRQ(ierr);
1101       ierr = PetscFree(indx2);CHKERRQ(ierr);
1102 #endif
1103       break;
1104     default:
1105 #if defined(PETSC_USE_COMPLEX)
1106       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1107 
1108       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1109       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1110       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1111       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1112       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1113       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1114       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1115       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1116 #else
1117       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1118 
1119       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1120 
1121       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1122 
1123       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1124 
1125       partial_dim = fftw->partial_dim;
1126 
1127       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1128       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1129 
1130       if (dim[ndim-1]%2==0) NM = 2;
1131       else                  NM = 1;
1132 
1133       j = low;
1134       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1135         indx1[i] = local_0_start*partial_dim + i;
1136         indx2[i] = j;
1137         if (k%dim[ndim-1]==0) j+=NM;
1138         j++;
1139       }
1140       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1141       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1142 
1143       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1144       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1145       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1146       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1147       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1148       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1149       ierr = PetscFree(indx1);CHKERRQ(ierr);
1150       ierr = PetscFree(indx2);CHKERRQ(ierr);
1151 #endif
1152       break;
1153     }
1154   }
1155   PetscFunctionReturn(0);
1156 }
1157 
1158 /*
1159     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1160 
1161   Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1163 
1164    Level: intermediate
1165 
1166 */
1167 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1168 {
1169   PetscErrorCode ierr;
1170   MPI_Comm       comm;
1171   Mat_FFT        *fft=(Mat_FFT*)A->data;
1172   Mat_FFTW       *fftw;
1173   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1174   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1175   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1176   PetscBool      flg;
1177   PetscInt       p_flag,partial_dim=1,ctr;
1178   PetscMPIInt    size,rank;
1179   ptrdiff_t      *pdim;
1180   ptrdiff_t      local_n1,local_1_start;
1181 #if !defined(PETSC_USE_COMPLEX)
1182   ptrdiff_t      temp;
1183   PetscInt       N1,tot_dim;
1184 #else
1185   PetscInt       n1;
1186 #endif
1187 
1188   PetscFunctionBegin;
1189   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1190   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1191   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1192 
1193   fftw_mpi_init();
1194   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1195   pdim[0] = dim[0];
1196 #if !defined(PETSC_USE_COMPLEX)
1197   tot_dim = 2*dim[0];
1198 #endif
1199   for (ctr=1; ctr<ndim; ctr++) {
1200     partial_dim *= dim[ctr];
1201     pdim[ctr]    = dim[ctr];
1202 #if !defined(PETSC_USE_COMPLEX)
1203     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1204     else             tot_dim *= dim[ctr];
1205 #endif
1206   }
1207 
1208   if (size == 1) {
1209 #if defined(PETSC_USE_COMPLEX)
1210     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1211     n    = N;
1212 #else
1213     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1214     n    = tot_dim;
1215 #endif
1216 
1217   } else {
1218     ptrdiff_t local_n0,local_0_start;
1219     switch (ndim) {
1220     case 1:
1221 #if !defined(PETSC_USE_COMPLEX)
1222       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1223 #else
1224       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1225 
1226       n    = (PetscInt)local_n0;
1227       n1   = (PetscInt)local_n1;
1228       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1229 #endif
1230       break;
1231     case 2:
1232 #if defined(PETSC_USE_COMPLEX)
1233       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1234       /*
1235        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1236        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1237        */
1238       n    = (PetscInt)local_n0*dim[1];
1239       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1240 #else
1241       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1242 
1243       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1244       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1245 #endif
1246       break;
1247     case 3:
1248 #if defined(PETSC_USE_COMPLEX)
1249       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1250 
1251       n    = (PetscInt)local_n0*dim[1]*dim[2];
1252       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1253 #else
1254       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1255 
1256       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1257       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1258 #endif
1259       break;
1260     default:
1261 #if defined(PETSC_USE_COMPLEX)
1262       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1263 
1264       n    = (PetscInt)local_n0*partial_dim;
1265       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1266 #else
1267       temp = pdim[ndim-1];
1268 
1269       pdim[ndim-1] = temp/2 + 1;
1270 
1271       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1272 
1273       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1274       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1275 
1276       pdim[ndim-1] = temp;
1277 
1278       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1279 #endif
1280       break;
1281     }
1282   }
1283   free(pdim);
1284   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1285   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1286   fft->data = (void*)fftw;
1287 
1288   fft->n            = n;
1289   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1290   fftw->partial_dim = partial_dim;
1291 
1292   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
1293   if (size == 1) {
1294 #if defined(PETSC_USE_64BIT_INDICES)
1295     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1296 #else
1297     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1298 #endif
1299   }
1300 
1301   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1302 
1303   fftw->p_forward  = 0;
1304   fftw->p_backward = 0;
1305   fftw->p_flag     = FFTW_ESTIMATE;
1306 
1307   if (size == 1) {
1308     A->ops->mult          = MatMult_SeqFFTW;
1309     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1310   } else {
1311     A->ops->mult          = MatMult_MPIFFTW;
1312     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1313   }
1314   fft->matdestroy = MatDestroy_FFTW;
1315   A->assembled    = PETSC_TRUE;
1316   A->preallocated = PETSC_TRUE;
1317 
1318   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);CHKERRQ(ierr);
1319   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1320   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1321 
1322   /* get runtime options */
1323   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1324   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
1325   if (flg) {
1326     fftw->p_flag = iplans[p_flag];
1327   }
1328   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 
1333 
1334 
1335