xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 2fa40bb9206b96114faa7cb222621ec184d31cd2)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
/* Private data of MATFFTW: cached FFTW plans plus the bookkeeping needed to reuse them safely */
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;  /* dimensions in FFTW's native ptrdiff_t form (used by the MPI/guru interfaces) */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;              /* guru-interface dimension descriptors (64-bit indices) */
#else
  fftw_iodim   *iodims;              /* guru-interface dimension descriptors */
#endif
  PetscInt     partial_dim;
  fftw_plan    p_forward,p_backward; /* plans are created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
25 
26 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
27 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
28 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
29 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
30 extern PetscErrorCode MatDestroy_FFTW(Mat);
31 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
32 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
33 
/* MatMult_SeqFFTW performs serial forward DFT
35    Input parameter:
36      A - the matrix
37      x - the vector on which FDFT will be performed
38 
39    Output parameter:
40      y - vector that stores result of FDFT
41 */
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* ndim > 3: use the guru interface with explicit row-major (C-order) strides */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        /* last dimension is contiguous; strides of earlier dimensions are products of later extents */
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        /* last dimension is contiguous; strides of earlier dimensions are products of later extents */
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for; fftw_execute() is only valid on those */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
135 
136 /* MatMultTranspose_SeqFFTW performs serial backward DFT
137    Input parameter:
138      A - the matrix
139      x - the vector on which BDFT will be performed
140 
141    Output parameter:
142      y - vector that stores result of BDFT
143 */
144 
145 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
146 {
147   PetscErrorCode ierr;
148   Mat_FFT        *fft  = (Mat_FFT*)A->data;
149   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
150   const PetscScalar *x_array;
151   PetscScalar    *y_array;
152   PetscInt       ndim=fft->ndim,*dim=fft->dim;
153 #if defined(PETSC_USE_COMPLEX)
154 #if defined(PETSC_USE_64BIT_INDICES)
155   fftw_iodim64   *iodims=fftw->iodims;
156 #else
157   fftw_iodim     *iodims=fftw->iodims;
158 #endif
159 #endif
160 
161   PetscFunctionBegin;
162   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
163   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
164   if (!fftw->p_backward) { /* create a plan, then excute it */
165     switch (ndim) {
166     case 1:
167 #if defined(PETSC_USE_COMPLEX)
168       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
169 #else
170       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
171 #endif
172       break;
173     case 2:
174 #if defined(PETSC_USE_COMPLEX)
175       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
176 #else
177       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
178 #endif
179       break;
180     case 3:
181 #if defined(PETSC_USE_COMPLEX)
182       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
183 #else
184       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
185 #endif
186       break;
187     default:
188 #if defined(PETSC_USE_COMPLEX)
189 #if defined(PETSC_USE_64BIT_INDICES)
190       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
191 #else
192       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
193 #endif
194 #else
195       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
196 #endif
197       break;
198     }
199     fftw->binarray  = (PetscScalar *) x_array;
200     fftw->boutarray = y_array;
201     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
202   } else { /* use existing plan */
203     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
204 #if defined(PETSC_USE_COMPLEX)
205       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
206 #else
207       fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
208 #endif
209     } else {
210       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
211     }
212   }
213   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
214   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 /* MatMult_MPIFFTW performs forward DFT in parallel
219    Input parameter:
220      A - the matrix
221      x - the vector on which FDFT will be performed
222 
223    Output parameter:
224    y   - vector that stores result of FDFT
225 */
226 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
227 {
228   PetscErrorCode ierr;
229   Mat_FFT        *fft  = (Mat_FFT*)A->data;
230   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
231   const PetscScalar *x_array;
232   PetscScalar    *y_array;
233   PetscInt       ndim=fft->ndim,*dim=fft->dim;
234   MPI_Comm       comm;
235 
236   PetscFunctionBegin;
237   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
238   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
239   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
240   if (!fftw->p_forward) { /* create a plan, then excute it */
241     switch (ndim) {
242     case 1:
243 #if defined(PETSC_USE_COMPLEX)
244       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
245 #else
246       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
247 #endif
248       break;
249     case 2:
250 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
251       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
252 #else
253       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
254 #endif
255       break;
256     case 3:
257 #if defined(PETSC_USE_COMPLEX)
258       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
259 #else
260       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
261 #endif
262       break;
263     default:
264 #if defined(PETSC_USE_COMPLEX)
265       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
266 #else
267       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
268 #endif
269       break;
270     }
271     fftw->finarray  = (PetscScalar *) x_array;
272     fftw->foutarray = y_array;
273     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
274                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
275     fftw_execute(fftw->p_forward);
276   } else { /* use existing plan */
277     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
278       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
279     } else {
280       fftw_execute(fftw->p_forward);
281     }
282   }
283   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
284   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
285   PetscFunctionReturn(0);
286 }
287 
288 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
289    Input parameter:
290      A - the matrix
291      x - the vector on which BDFT will be performed
292 
293    Output parameter:
294      y - vector that stores result of BDFT
295 */
296 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
297 {
298   PetscErrorCode ierr;
299   Mat_FFT        *fft  = (Mat_FFT*)A->data;
300   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
301   const PetscScalar *x_array;
302   PetscScalar    *y_array;
303   PetscInt       ndim=fft->ndim,*dim=fft->dim;
304   MPI_Comm       comm;
305 
306   PetscFunctionBegin;
307   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
308   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
309   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
310   if (!fftw->p_backward) { /* create a plan, then excute it */
311     switch (ndim) {
312     case 1:
313 #if defined(PETSC_USE_COMPLEX)
314       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
315 #else
316       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
317 #endif
318       break;
319     case 2:
320 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
321       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
322 #else
323       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
324 #endif
325       break;
326     case 3:
327 #if defined(PETSC_USE_COMPLEX)
328       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
329 #else
330       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
331 #endif
332       break;
333     default:
334 #if defined(PETSC_USE_COMPLEX)
335       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
336 #else
337       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
338 #endif
339       break;
340     }
341     fftw->binarray  = (PetscScalar *) x_array;
342     fftw->boutarray = y_array;
343     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
344   } else { /* use existing plan */
345     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
346       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
347     } else {
348       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
349     }
350   }
351   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
352   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
353   PetscFunctionReturn(0);
354 }
355 
356 PetscErrorCode MatDestroy_FFTW(Mat A)
357 {
358   Mat_FFT        *fft  = (Mat_FFT*)A->data;
359   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
360   PetscErrorCode ierr;
361 
362   PetscFunctionBegin;
363   fftw_destroy_plan(fftw->p_forward);
364   fftw_destroy_plan(fftw->p_backward);
365   if (fftw->iodims) {
366     free(fftw->iodims);
367   }
368   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
369   ierr = PetscFree(fft->data);CHKERRQ(ierr);
370   fftw_mpi_cleanup();
371   PetscFunctionReturn(0);
372 }
373 
374 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
375 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
376 {
377   PetscErrorCode ierr;
378   PetscScalar    *array;
379 
380   PetscFunctionBegin;
381   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
382   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
383   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
384   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
385   PetscFunctionReturn(0);
386 }
387 
388 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
389 {
390   PetscErrorCode ierr;
391   Mat            A;
392 
393   PetscFunctionBegin;
394   ierr = PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
395   ierr = MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);CHKERRQ(ierr);
396   PetscFunctionReturn(0);
397 }
398 
399 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
400 {
401   PetscErrorCode ierr;
402   Mat            A;
403 
404   PetscFunctionBegin;
405   ierr = PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
406   ierr = MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);CHKERRQ(ierr);
407   PetscFunctionReturn(0);
408 }
409 
410 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
411 {
412   PetscErrorCode ierr;
413   Mat            A;
414 
415   PetscFunctionBegin;
416   ierr = PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
417   ierr = MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);CHKERRQ(ierr);
418   PetscFunctionReturn(0);
419 }
420 
421 /*@
422    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
423      parallel layout determined by FFTW
424 
425    Collective on Mat
426 
427    Input Parameter:
428 .   A - the matrix
429 
430    Output Parameters:
431 +   x - (optional) input vector of forward FFTW
432 .   y - (optional) output vector of forward FFTW
433 -   z - (optional) output vector of backward FFTW
434 
435   Level: advanced
436 
437   Note: The parallel layout of output of forward FFTW is always same as the input
438         of the backward FFTW. But parallel layout of the input vector of forward
439         FFTW might not be same as the output of backward FFTW.
440         Also note that we need to provide enough space while doing parallel real transform.
        We need to pad extra zeros at the end of last dimension. For this reason one needs to
        invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
443         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
444         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
445         zeros if it is odd only one zero is needed.
446         Lastly one needs some scratch space at the end of data set in each process. alloc_local
447         figures out how much space is needed, i.e. it figures out the data+scratch space for
448         each processor and returns that.
449 
450 .seealso: MatCreateFFT()
451 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the implementation registered on A (MatCreateVecsFFTW_FFTW for MATFFTW) */
  ierr = PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
460 
461 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
462 {
463   PetscErrorCode ierr;
464   PetscMPIInt    size,rank;
465   MPI_Comm       comm;
466   Mat_FFT        *fft  = (Mat_FFT*)A->data;
467   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
468   PetscInt       N     = fft->N;
469   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
470 
471   PetscFunctionBegin;
472   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
473   PetscValidType(A,1);
474   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
475 
476   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
477   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
478   if (size == 1) { /* sequential case */
479 #if defined(PETSC_USE_COMPLEX)
480     if (fin)  {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
481     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
482     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
483 #else
484     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
485     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
486     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
487 #endif
488   } else { /* parallel cases */
489     ptrdiff_t    alloc_local,local_n0,local_0_start;
490     ptrdiff_t    local_n1;
491     fftw_complex *data_fout;
492     ptrdiff_t    local_1_start;
493 #if defined(PETSC_USE_COMPLEX)
494     fftw_complex *data_fin,*data_bout;
495 #else
496     double    *data_finr,*data_boutr;
497     PetscInt  n1,N1;
498     ptrdiff_t temp;
499 #endif
500 
501     switch (ndim) {
502     case 1:
503 #if !defined(PETSC_USE_COMPLEX)
504       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
505 #else
506       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
507       if (fin) {
508         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
509         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
510         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
511         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
512         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
513       }
514       if (fout) {
515         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
516         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
517         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
518         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
519         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
520       }
521       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
522       if (bout) {
523         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
524         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
525         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
526         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
527         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
528       }
529       break;
530 #endif
531     case 2:
532 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
533       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
534       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
535       if (fin) {
536         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
537         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
538         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
539         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
540         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
541       }
542       if (fout) {
543         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
544         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
545         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
546         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
547         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
548       }
549       if (bout) {
550         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
551         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
552         ierr       = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
553         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
554         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
555       }
556 #else
557       /* Get local size */
558       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
559       if (fin) {
560         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
561         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
562         ierr     = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
563         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
564         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
565       }
566       if (fout) {
567         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
568         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
569         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
570         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
571         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
572       }
573       if (bout) {
574         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
575         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
576         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
577         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
578         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
579       }
580 #endif
581       break;
582     case 3:
583 #if !defined(PETSC_USE_COMPLEX)
584       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
585       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
586       if (fin) {
587         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
588         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
589         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
590         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
591         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
592       }
593       if (fout) {
594         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
595         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
596         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
597         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
598         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
599       }
600       if (bout) {
601         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
602         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
603         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
604         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
605         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
606       }
607 #else
608       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
609       if (fin) {
610         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
611         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
612         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
613         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
614         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
615       }
616       if (fout) {
617         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
618         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
619         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
620         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
621         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
622       }
623       if (bout) {
624         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
625         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
626         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
627         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
628         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
629       }
630 #endif
631       break;
632     default:
633 #if !defined(PETSC_USE_COMPLEX)
634       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
635 
636       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
637 
638       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
639       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
640 
641       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
642 
643       if (fin) {
644         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
645         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
646         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
647         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
648         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
649       }
650       if (fout) {
651         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
652         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
653         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
654         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
655         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
656       }
657       if (bout) {
658         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
659         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
660         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
661         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
662         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
663       }
664 #else
665       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
666       if (fin) {
667         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
668         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
669         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
670         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
671         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
672       }
673       if (fout) {
674         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
675         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
676         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
677         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
678         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
679       }
680       if (bout) {
681         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
682         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
683         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
684         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
685         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
686       }
687 #endif
688       break;
689     }
    /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
       v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental uses.
     */
694     if (fin)  (*fin)->ops->replacearray = NULL;
695     if (fout) (*fout)->ops->replacearray = NULL;
696     if (bout) (*bout)->ops->replacearray = NULL;
697   }
698   PetscFunctionReturn(0);
699 }
700 
701 /*@
702    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
703 
704    Collective on Mat
705 
706    Input Parameters:
707 +  A - FFTW matrix
708 -  x - the PETSc vector
709 
710    Output Parameters:
711 .  y - the FFTW vector
712 
713   Options Database Keys:
714 . -mat_fftw_plannerflags - set FFTW planner flags
715 
716    Level: intermediate
717 
   Note: For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job through a scattering operation.
721 
722 .seealso: VecScatterFFTWToPetsc()
723 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the name "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW for MATFFTW matrices; see MatCreate_FFTW) */
  ierr = PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
732 
733 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
734 {
735   PetscErrorCode ierr;
736   MPI_Comm       comm;
737   Mat_FFT        *fft  = (Mat_FFT*)A->data;
738   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
739   PetscInt       N     =fft->N;
740   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
741   PetscInt       low;
742   PetscMPIInt    rank,size;
743   PetscInt       vsize,vsize1;
744   ptrdiff_t      local_n0,local_0_start;
745   ptrdiff_t      local_n1,local_1_start;
746   VecScatter     vecscat;
747   IS             list1,list2;
748 #if !defined(PETSC_USE_COMPLEX)
749   PetscInt       i,j,k,partial_dim;
750   PetscInt       *indx1, *indx2, tempindx, tempindx1;
751   PetscInt       NM;
752   ptrdiff_t      temp;
753 #endif
754 
755   PetscFunctionBegin;
756   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
757   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
758   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
759   ierr = VecGetOwnershipRange(y,&low,NULL);CHKERRQ(ierr);
760 
761   if (size==1) {
762     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
763     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
764     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
765     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
766     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
767     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
768     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
769     ierr = ISDestroy(&list1);CHKERRQ(ierr);
770   } else {
771     switch (ndim) {
772     case 1:
773 #if defined(PETSC_USE_COMPLEX)
774       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
775 
776       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);CHKERRQ(ierr);
777       ierr = ISCreateStride(comm,local_n0,low,1,&list2);CHKERRQ(ierr);
778       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
779       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
780       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
781       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
782       ierr = ISDestroy(&list1);CHKERRQ(ierr);
783       ierr = ISDestroy(&list2);CHKERRQ(ierr);
784 #else
785       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
786 #endif
787       break;
788     case 2:
789 #if defined(PETSC_USE_COMPLEX)
790       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
791 
792       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);CHKERRQ(ierr);
793       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);CHKERRQ(ierr);
794       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
795       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
796       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
797       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
798       ierr = ISDestroy(&list1);CHKERRQ(ierr);
799       ierr = ISDestroy(&list2);CHKERRQ(ierr);
800 #else
801       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
802 
803       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
804       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
805 
806       if (dim[1]%2==0) {
807         NM = dim[1]+2;
808       } else {
809         NM = dim[1]+1;
810       }
811       for (i=0; i<local_n0; i++) {
812         for (j=0; j<dim[1]; j++) {
813           tempindx  = i*dim[1] + j;
814           tempindx1 = i*NM + j;
815 
816           indx1[tempindx]=local_0_start*dim[1]+tempindx;
817           indx2[tempindx]=low+tempindx1;
818         }
819       }
820 
821       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
822       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
823 
824       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
825       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
826       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
827       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
828       ierr = ISDestroy(&list1);CHKERRQ(ierr);
829       ierr = ISDestroy(&list2);CHKERRQ(ierr);
830       ierr = PetscFree(indx1);CHKERRQ(ierr);
831       ierr = PetscFree(indx2);CHKERRQ(ierr);
832 #endif
833       break;
834 
835     case 3:
836 #if defined(PETSC_USE_COMPLEX)
837       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
838 
839       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);CHKERRQ(ierr);
840       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);CHKERRQ(ierr);
841       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
842       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
843       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
844       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
845       ierr = ISDestroy(&list1);CHKERRQ(ierr);
846       ierr = ISDestroy(&list2);CHKERRQ(ierr);
847 #else
848       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
849       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
850       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
851 
852       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
853       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
854 
855       if (dim[2]%2==0) NM = dim[2]+2;
856       else             NM = dim[2]+1;
857 
858       for (i=0; i<local_n0; i++) {
859         for (j=0; j<dim[1]; j++) {
860           for (k=0; k<dim[2]; k++) {
861             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
862             tempindx1 = i*dim[1]*NM + j*NM + k;
863 
864             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
865             indx2[tempindx]=low+tempindx1;
866           }
867         }
868       }
869 
870       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
871       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
872       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
873       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
874       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
875       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
876       ierr = ISDestroy(&list1);CHKERRQ(ierr);
877       ierr = ISDestroy(&list2);CHKERRQ(ierr);
878       ierr = PetscFree(indx1);CHKERRQ(ierr);
879       ierr = PetscFree(indx2);CHKERRQ(ierr);
880 #endif
881       break;
882 
883     default:
884 #if defined(PETSC_USE_COMPLEX)
885       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
886 
887       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);CHKERRQ(ierr);
888       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);CHKERRQ(ierr);
889       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
890       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
891       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
892       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
893       ierr = ISDestroy(&list1);CHKERRQ(ierr);
894       ierr = ISDestroy(&list2);CHKERRQ(ierr);
895 #else
896       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
897       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
898       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
899 
900       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
901 
902       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
903 
904       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
905 
906       partial_dim = fftw->partial_dim;
907 
908       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
909       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
910 
911       if (dim[ndim-1]%2==0) NM = 2;
912       else                  NM = 1;
913 
914       j = low;
915       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
916         indx1[i] = local_0_start*partial_dim + i;
917         indx2[i] = j;
918         if (k%dim[ndim-1]==0) j+=NM;
919         j++;
920       }
921       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
922       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
923       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
924       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
925       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
926       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
927       ierr = ISDestroy(&list1);CHKERRQ(ierr);
928       ierr = ISDestroy(&list2);CHKERRQ(ierr);
929       ierr = PetscFree(indx1);CHKERRQ(ierr);
930       ierr = PetscFree(indx2);CHKERRQ(ierr);
931 #endif
932       break;
933     }
934   }
935   PetscFunctionReturn(0);
936 }
937 
938 /*@
939    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
940 
941    Collective on Mat
942 
943     Input Parameters:
944 +   A - FFTW matrix
945 -   x - FFTW vector
946 
947    Output Parameters:
948 .  y - PETSc vector
949 
950    Level: intermediate
951 
   Note: When doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
         VecScatterFFTWToPetsc removes those extra zeros.
954 
955 .seealso: VecScatterPetscToFFTW()
956 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the name "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW matrices; see MatCreate_FFTW) */
  ierr = PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
965 
966 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
967 {
968   PetscErrorCode ierr;
969   MPI_Comm       comm;
970   Mat_FFT        *fft  = (Mat_FFT*)A->data;
971   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
972   PetscInt       N     = fft->N;
973   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
974   PetscInt       low;
975   PetscMPIInt    rank,size;
976   ptrdiff_t      local_n0,local_0_start;
977   ptrdiff_t      local_n1,local_1_start;
978   VecScatter     vecscat;
979   IS             list1,list2;
980 #if !defined(PETSC_USE_COMPLEX)
981   PetscInt       i,j,k,partial_dim;
982   PetscInt       *indx1, *indx2, tempindx, tempindx1;
983   PetscInt       NM;
984   ptrdiff_t      temp;
985 #endif
986 
987   PetscFunctionBegin;
988   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
989   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
990   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
991   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
992 
993   if (size==1) {
994     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
995     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
996     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
997     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
998     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
999     ierr = ISDestroy(&list1);CHKERRQ(ierr);
1000 
1001   } else {
1002     switch (ndim) {
1003     case 1:
1004 #if defined(PETSC_USE_COMPLEX)
1005       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1006 
1007       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);CHKERRQ(ierr);
1008       ierr = ISCreateStride(comm,local_n1,low,1,&list2);CHKERRQ(ierr);
1009       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1010       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1011       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1012       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1013       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1014       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1015 #else
1016       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1017 #endif
1018       break;
1019     case 2:
1020 #if defined(PETSC_USE_COMPLEX)
1021       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1022 
1023       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);CHKERRQ(ierr);
1024       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);CHKERRQ(ierr);
1025       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1026       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1027       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1028       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1029       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1030       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1031 #else
1032       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1033 
1034       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
1035       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
1036 
1037       if (dim[1]%2==0) NM = dim[1]+2;
1038       else             NM = dim[1]+1;
1039 
1040       for (i=0; i<local_n0; i++) {
1041         for (j=0; j<dim[1]; j++) {
1042           tempindx = i*dim[1] + j;
1043           tempindx1 = i*NM + j;
1044 
1045           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1046           indx2[tempindx]=low+tempindx1;
1047         }
1048       }
1049 
1050       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1051       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1052 
1053       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1054       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1055       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1056       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1057       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1058       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1059       ierr = PetscFree(indx1);CHKERRQ(ierr);
1060       ierr = PetscFree(indx2);CHKERRQ(ierr);
1061 #endif
1062       break;
1063     case 3:
1064 #if defined(PETSC_USE_COMPLEX)
1065       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1066 
1067       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);CHKERRQ(ierr);
1068       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);CHKERRQ(ierr);
1069       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1070       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1071       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1072       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1073       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1074       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1075 #else
1076       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1077 
1078       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1079       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
1080 
1081       if (dim[2]%2==0) NM = dim[2]+2;
1082       else             NM = dim[2]+1;
1083 
1084       for (i=0; i<local_n0; i++) {
1085         for (j=0; j<dim[1]; j++) {
1086           for (k=0; k<dim[2]; k++) {
1087             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1088             tempindx1 = i*dim[1]*NM + j*NM + k;
1089 
1090             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1091             indx2[tempindx]=low+tempindx1;
1092           }
1093         }
1094       }
1095 
1096       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1097       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1098 
1099       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1100       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1101       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1102       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1103       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1104       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1105       ierr = PetscFree(indx1);CHKERRQ(ierr);
1106       ierr = PetscFree(indx2);CHKERRQ(ierr);
1107 #endif
1108       break;
1109     default:
1110 #if defined(PETSC_USE_COMPLEX)
1111       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1112 
1113       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);CHKERRQ(ierr);
1114       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);CHKERRQ(ierr);
1115       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1116       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1117       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1118       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1119       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1120       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1121 #else
1122       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1123 
1124       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1125 
1126       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1127 
1128       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1129 
1130       partial_dim = fftw->partial_dim;
1131 
1132       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1133       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1134 
1135       if (dim[ndim-1]%2==0) NM = 2;
1136       else                  NM = 1;
1137 
1138       j = low;
1139       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1140         indx1[i] = local_0_start*partial_dim + i;
1141         indx2[i] = j;
1142         if (k%dim[ndim-1]==0) j+=NM;
1143         j++;
1144       }
1145       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1146       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1147 
1148       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1149       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1150       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1151       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1152       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1153       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1154       ierr = PetscFree(indx1);CHKERRQ(ierr);
1155       ierr = PetscFree(indx2);CHKERRQ(ierr);
1156 #endif
1157       break;
1158     }
1159   }
1160   PetscFunctionReturn(0);
1161 }
1162 
1163 /*
1164     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1165 
1166   Options Database Keys:
1167 + -mat_fftw_plannerflags - set FFTW planner flags
1168 
1169    Level: intermediate
1170 
1171 */
1172 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1173 {
1174   PetscErrorCode ierr;
1175   MPI_Comm       comm;
1176   Mat_FFT        *fft=(Mat_FFT*)A->data;
1177   Mat_FFTW       *fftw;
1178   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1179   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1180   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1181   PetscBool      flg;
1182   PetscInt       p_flag,partial_dim=1,ctr;
1183   PetscMPIInt    size,rank;
1184   ptrdiff_t      *pdim;
1185   ptrdiff_t      local_n1,local_1_start;
1186 #if !defined(PETSC_USE_COMPLEX)
1187   ptrdiff_t      temp;
1188   PetscInt       N1,tot_dim;
1189 #else
1190   PetscInt       n1;
1191 #endif
1192 
1193   PetscFunctionBegin;
1194   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1195   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
1196   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
1197 
1198   fftw_mpi_init();
1199   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1200   pdim[0] = dim[0];
1201 #if !defined(PETSC_USE_COMPLEX)
1202   tot_dim = 2*dim[0];
1203 #endif
1204   for (ctr=1; ctr<ndim; ctr++) {
1205     partial_dim *= dim[ctr];
1206     pdim[ctr]    = dim[ctr];
1207 #if !defined(PETSC_USE_COMPLEX)
1208     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1209     else             tot_dim *= dim[ctr];
1210 #endif
1211   }
1212 
1213   if (size == 1) {
1214 #if defined(PETSC_USE_COMPLEX)
1215     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1216     n    = N;
1217 #else
1218     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1219     n    = tot_dim;
1220 #endif
1221 
1222   } else {
1223     ptrdiff_t local_n0,local_0_start;
1224     switch (ndim) {
1225     case 1:
1226 #if !defined(PETSC_USE_COMPLEX)
1227       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1228 #else
1229       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1230 
1231       n    = (PetscInt)local_n0;
1232       n1   = (PetscInt)local_n1;
1233       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1234 #endif
1235       break;
1236     case 2:
1237 #if defined(PETSC_USE_COMPLEX)
1238       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1239       /*
1240        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1241        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1242        */
1243       n    = (PetscInt)local_n0*dim[1];
1244       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1245 #else
1246       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1247 
1248       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1249       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));CHKERRQ(ierr);
1250 #endif
1251       break;
1252     case 3:
1253 #if defined(PETSC_USE_COMPLEX)
1254       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1255 
1256       n    = (PetscInt)local_n0*dim[1]*dim[2];
1257       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1258 #else
1259       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1260 
1261       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1262       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));CHKERRQ(ierr);
1263 #endif
1264       break;
1265     default:
1266 #if defined(PETSC_USE_COMPLEX)
1267       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1268 
1269       n    = (PetscInt)local_n0*partial_dim;
1270       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1271 #else
1272       temp = pdim[ndim-1];
1273 
1274       pdim[ndim-1] = temp/2 + 1;
1275 
1276       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1277 
1278       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1279       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1280 
1281       pdim[ndim-1] = temp;
1282 
1283       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1284 #endif
1285       break;
1286     }
1287   }
1288   free(pdim);
1289   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1290   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1291   fft->data = (void*)fftw;
1292 
1293   fft->n            = n;
1294   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1295   fftw->partial_dim = partial_dim;
1296 
1297   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
1298   if (size == 1) {
1299 #if defined(PETSC_USE_64BIT_INDICES)
1300     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1301 #else
1302     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1303 #endif
1304   }
1305 
1306   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1307 
1308   fftw->p_forward  = NULL;
1309   fftw->p_backward = NULL;
1310   fftw->p_flag     = FFTW_ESTIMATE;
1311 
1312   if (size == 1) {
1313     A->ops->mult          = MatMult_SeqFFTW;
1314     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1315   } else {
1316     A->ops->mult          = MatMult_MPIFFTW;
1317     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1318   }
1319   fft->matdestroy = MatDestroy_FFTW;
1320   A->assembled    = PETSC_TRUE;
1321   A->preallocated = PETSC_TRUE;
1322 
1323   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);CHKERRQ(ierr);
1324   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1325   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1326 
1327   /* get runtime options */
1328   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1329   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
1330   if (flg) {
1331     fftw->p_flag = iplans[p_flag];
1332   }
1333   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1334   PetscFunctionReturn(0);
1335 }
1336 
1337