xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 61a6e5df8a56e9f4635d65196feaa1303d4e372f)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10 #include <fftw3-mpi.h>
11 #else
12 #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
/* Private data for MATFFTW matrices; hangs off Mat_FFT->data */
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;  /* transform rank and sizes in FFTW's native ptrdiff_t type */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;              /* dimension descriptors for FFTW's 64-bit guru interface */
#else
  fftw_iodim   *iodims;              /* dimension descriptors for FFTW's guru interface */
#endif
  PetscInt     partial_dim;          /* NOTE(review): not used in this chunk; presumably a partial product of the dims -- confirm at creation site */
  fftw_plan    p_forward,p_backward; /* cached plans, created lazily on first MatMult / MatMultTranspose */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
                                                             executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
40 /*
41    MatMult_SeqFFTW performs forward DFT
42    Input parameter:
43      A - the matrix
44      x - the vector on which FDFT will be performed
45 
46    Output parameter:
47      y - vector that stores result of FDFT
48 */
/*
   MatMult_SeqFFTW performs forward DFT
   Input parameter:
     A - the matrix
     x - the vector on which FDFT will be performed

   Output parameter:
     y - vector that stores result of FDFT

   A plan is created lazily on the first call and cached together with the
   addresses of the arrays it was planned for; later calls with the same
   arrays reuse fftw_execute(), while calls with different arrays go through
   FFTW's new-array execute interface (fftw_execute_dft / fftw_execute_dft_r2c).
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      /* Fill row-major strides for the guru interface: the last dimension is
         contiguous (stride 1), each earlier stride is the product of the later sizes */
      if (ndim) {
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      /* NOTE(review): the cast of PetscInt* to int* is only valid when PetscInt is int;
         verify this real-scalar path is not reachable with PETSC_USE_64BIT_INDICES */
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
142 
143 /* MatMultTranspose_SeqFFTW performs serial backward DFT
144    Input parameter:
145      A - the matrix
146      x - the vector on which BDFT will be performed
147 
148    Output parameter:
149      y - vector that stores result of BDFT
150 */
151 
PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim = fft->ndim,*dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims = fftw->iodims;
#else
  fftw_iodim     *iodims = fftw->iodims;
#endif
#endif

  PetscFunctionBegin;
  ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      /* NOTE(review): iodims contents are populated in MatMult_SeqFFTW's default
         branch; if the backward transform is ever invoked first for ndim > 3 they
         may be uninitialized here -- confirm they are filled at matrix creation */
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#endif
#else
      /* NOTE(review): (int*)dim cast is only valid when PetscInt is int -- see forward path */
      fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays this plan was created for (see struct comment) */
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
224 
225 #if !PetscDefined(HAVE_MPIUNI)
226 /* MatMult_MPIFFTW performs forward DFT in parallel
227    Input parameter:
228      A - the matrix
229      x - the vector on which FDFT will be performed
230 
231    Output parameter:
232    y   - vector that stores result of FDFT
233 */
234 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
235 {
236   PetscErrorCode ierr;
237   Mat_FFT        *fft  = (Mat_FFT*)A->data;
238   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
239   const PetscScalar *x_array;
240   PetscScalar    *y_array;
241   PetscInt       ndim = fft->ndim,*dim = fft->dim;
242   MPI_Comm       comm;
243 
244   PetscFunctionBegin;
245   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
246   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
247   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
248   if (!fftw->p_forward) { /* create a plan, then excute it */
249     switch (ndim) {
250     case 1:
251 #if defined(PETSC_USE_COMPLEX)
252       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
253 #else
254       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
255 #endif
256       break;
257     case 2:
258 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
259       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
260 #else
261       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
262 #endif
263       break;
264     case 3:
265 #if defined(PETSC_USE_COMPLEX)
266       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
267 #else
268       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
269 #endif
270       break;
271     default:
272 #if defined(PETSC_USE_COMPLEX)
273       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
274 #else
275       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
276 #endif
277       break;
278     }
279     fftw->finarray  = (PetscScalar *) x_array;
280     fftw->foutarray = y_array;
281     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
282                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
283     fftw_execute(fftw->p_forward);
284   } else { /* use existing plan */
285     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
286       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
287     } else {
288       fftw_execute(fftw->p_forward);
289     }
290   }
291   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
292   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /*
297    MatMultTranspose_MPIFFTW performs parallel backward DFT
298    Input parameter:
299      A - the matrix
300      x - the vector on which BDFT will be performed
301 
302    Output parameter:
303      y - vector that stores result of BDFT
304 */
305 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
306 {
307   PetscErrorCode ierr;
308   Mat_FFT        *fft  = (Mat_FFT*)A->data;
309   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
310   const PetscScalar *x_array;
311   PetscScalar    *y_array;
312   PetscInt       ndim = fft->ndim,*dim = fft->dim;
313   MPI_Comm       comm;
314 
315   PetscFunctionBegin;
316   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
317   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
318   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
319   if (!fftw->p_backward) { /* create a plan, then excute it */
320     switch (ndim) {
321     case 1:
322 #if defined(PETSC_USE_COMPLEX)
323       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
324 #else
325       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
326 #endif
327       break;
328     case 2:
329 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
330       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
331 #else
332       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
333 #endif
334       break;
335     case 3:
336 #if defined(PETSC_USE_COMPLEX)
337       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
338 #else
339       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
340 #endif
341       break;
342     default:
343 #if defined(PETSC_USE_COMPLEX)
344       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
345 #else
346       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
347 #endif
348       break;
349     }
350     fftw->binarray  = (PetscScalar *) x_array;
351     fftw->boutarray = y_array;
352     fftw_execute(fftw->p_backward);
353   } else { /* use existing plan */
354     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
355       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
356     } else {
357       fftw_execute(fftw->p_backward);
358     }
359   }
360   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
361   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 #endif
365 
366 PetscErrorCode MatDestroy_FFTW(Mat A)
367 {
368   Mat_FFT        *fft  = (Mat_FFT*)A->data;
369   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
370   PetscErrorCode ierr;
371 
372   PetscFunctionBegin;
373   fftw_destroy_plan(fftw->p_forward);
374   fftw_destroy_plan(fftw->p_backward);
375   if (fftw->iodims) {
376     free(fftw->iodims);
377   }
378   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
379   ierr = PetscFree(fft->data);CHKERRQ(ierr);
380 #if !PetscDefined(HAVE_MPIUNI)
381   fftw_mpi_cleanup();
382 #endif
383   PetscFunctionReturn(0);
384 }
385 
386 #if !PetscDefined(HAVE_MPIUNI)
387 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
388 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
389 {
390   PetscErrorCode ierr;
391   PetscScalar    *array;
392 
393   PetscFunctionBegin;
394   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
395   fftw_free((fftw_complex*)array);
396   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
397   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
398   PetscFunctionReturn(0);
399 }
400 #endif
401 
402 #if !PetscDefined(HAVE_MPIUNI)
403 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
404 {
405   PetscErrorCode ierr;
406   Mat            A;
407 
408   PetscFunctionBegin;
409   ierr = PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
410   ierr = MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);CHKERRQ(ierr);
411   PetscFunctionReturn(0);
412 }
413 
414 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
415 {
416   PetscErrorCode ierr;
417   Mat            A;
418 
419   PetscFunctionBegin;
420   ierr = PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
421   ierr = MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);CHKERRQ(ierr);
422   PetscFunctionReturn(0);
423 }
424 
425 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
426 {
427   PetscErrorCode ierr;
428   Mat            A;
429 
430   PetscFunctionBegin;
431   ierr = PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
432   ierr = MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 #endif
436 
437 /*@
438    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
439      parallel layout determined by FFTW
440 
441    Collective on Mat
442 
443    Input Parameter:
444 .   A - the matrix
445 
446    Output Parameters:
447 +   x - (optional) input vector of forward FFTW
448 .   y - (optional) output vector of forward FFTW
449 -   z - (optional) output vector of backward FFTW
450 
451   Level: advanced
452 
453   Note: The parallel layout of output of forward FFTW is always same as the input
454         of the backward FFTW. But parallel layout of the input vector of forward
455         FFTW might not be same as the output of backward FFTW.
        Also note that for a parallel real transform one must provide enough space:
        extra zeros are padded at the end of the last dimension. For this reason one needs to
        invoke the fftw_mpi_local_size_transposed routine. Remember to change the
        last dimension from n to n/2+1 when invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd: if it is even two
        zeros are needed, if it is odd only one zero is needed.
462         Lastly one needs some scratch space at the end of data set in each process. alloc_local
463         figures out how much space is needed, i.e. it figures out the data+scratch space for
464         each processor and returns that.
465 
466 .seealso: MatCreateFFT()
467 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed on A under "MatCreateVecsFFTW_C" */
  ierr = PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
476 
477 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
478 {
479   PetscErrorCode ierr;
480   PetscMPIInt    size,rank;
481   MPI_Comm       comm;
482   Mat_FFT        *fft  = (Mat_FFT*)A->data;
483 
484   PetscFunctionBegin;
485   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
486   PetscValidType(A,1);
487   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
488 
489   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
490   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
491   if (size == 1) { /* sequential case */
492 #if defined(PETSC_USE_COMPLEX)
493     if (fin)  {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->N,fin);CHKERRQ(ierr);}
494     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->N,fout);CHKERRQ(ierr);}
495     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->N,bout);CHKERRQ(ierr);}
496 #else
497     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->n,fin);CHKERRQ(ierr);}
498     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->n,fout);CHKERRQ(ierr);}
499     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,fft->n,bout);CHKERRQ(ierr);}
500 #endif
501 #if !PetscDefined(HAVE_MPIUNI)
502   } else { /* parallel cases */
503     Mat_FFTW     *fftw = (Mat_FFTW*)fft->data;
504     PetscInt     ndim  = fft->ndim,*dim = fft->dim;
505     ptrdiff_t    alloc_local,local_n0,local_0_start;
506     ptrdiff_t    local_n1;
507     fftw_complex *data_fout;
508     ptrdiff_t    local_1_start;
509 #if defined(PETSC_USE_COMPLEX)
510     fftw_complex *data_fin,*data_bout;
511 #else
512     double       *data_finr,*data_boutr;
513     PetscInt     n1,N1;
514     ptrdiff_t    temp;
515 #endif
516 
517     switch (ndim) {
518     case 1:
519 #if !defined(PETSC_USE_COMPLEX)
520       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
521 #else
522       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
523       if (fin) {
524         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
525         ierr      = VecCreateMPIWithArray(comm,1,local_n0,fft->N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
526         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
527         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
528         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
529       }
530       if (fout) {
531         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
532         ierr      = VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
533         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
534         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
535         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
536       }
537       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
538       if (bout) {
539         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
540         ierr      = VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
541         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
542         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
543         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
544       }
545       break;
546 #endif
547     case 2:
548 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
549       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
550       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
551       if (fin) {
552         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
554         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
555         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
556         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
557       }
558       if (fout) {
559         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
560         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
561         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
562         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
563         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
564       }
565       if (bout) {
566         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
567         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
568         ierr       = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
569         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
570         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
571       }
572 #else
573       /* Get local size */
574       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
575       if (fin) {
576         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
577         ierr     = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
578         ierr     = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
579         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
580         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
581       }
582       if (fout) {
583         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
584         ierr      = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
585         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
586         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
587         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
588       }
589       if (bout) {
590         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
591         ierr      = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
592         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
593         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
594         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
595       }
596 #endif
597       break;
598     case 3:
599 #if !defined(PETSC_USE_COMPLEX)
600       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
601       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
602       if (fin) {
603         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
604         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
605         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
606         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
607         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
608       }
609       if (fout) {
610         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
611         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
612         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
613         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
614         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
615       }
616       if (bout) {
617         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
618         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
619         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
620         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
621         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
622       }
623 #else
624       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
625       if (fin) {
626         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
627         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
628         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
629         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
630         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
631       }
632       if (fout) {
633         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
634         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
635         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
636         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
637         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
638       }
639       if (bout) {
640         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
641         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
642         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
643         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
644         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
645       }
646 #endif
647       break;
648     default:
649 #if !defined(PETSC_USE_COMPLEX)
650       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
651       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
652       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
653       N1          = 2*fft->N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
654       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
655 
656       if (fin) {
657         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
658         ierr = VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
659         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
660         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
661         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
662       }
663       if (fout) {
664         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
665         ierr = VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
666         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
667         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
668         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
669       }
670       if (bout) {
671         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
672         ierr = VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
673         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
674         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
675         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
676       }
677 #else
678       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
679       if (fin) {
680         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
681         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
682         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
683         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
684         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
685       }
686       if (fout) {
687         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
688         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
689         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
690         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
691         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
692       }
693       if (bout) {
694         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
695         ierr = VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
696         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
697         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
698         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
699       }
700 #endif
701       break;
702     }
    /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
       v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental uses.
    */
707     if (fin)  (*fin)->ops->replacearray = NULL;
708     if (fout) (*fout)->ops->replacearray = NULL;
709     if (bout) (*bout)->ops->replacearray = NULL;
710 #endif
711   }
712   PetscFunctionReturn(0);
713 }
714 
715 /*@
716    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
717 
718    Collective on Mat
719 
720    Input Parameters:
721 +  A - FFTW matrix
722 -  x - the PETSc vector
723 
724    Output Parameters:
725 .  y - the FFTW vector
726 
727   Options Database Keys:
728 . -mat_fftw_plannerflags - set FFTW planner flags
729 
730    Level: intermediate
731 
   Note: For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job by a scattering operation.
735 
736 .seealso: VecScatterFFTWToPetsc()
737 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Forward to the type-specific method composed on A under the name
     "VecScatterPetscToFFTW_C" (VecScatterPetscToFFTW_FFTW for MATFFTW) */
  ierr = PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
746 
747 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
748 {
749   PetscErrorCode ierr;
750   MPI_Comm       comm;
751   Mat_FFT        *fft  = (Mat_FFT*)A->data;
752   PetscInt       low;
753   PetscMPIInt    rank,size;
754   PetscInt       vsize,vsize1;
755   VecScatter     vecscat;
756   IS             list1;
757 
758   PetscFunctionBegin;
759   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
760   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
761   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
762   ierr = VecGetOwnershipRange(y,&low,NULL);CHKERRQ(ierr);
763 
764   if (size==1) {
765     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
766     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
767     ierr = ISCreateStride(PETSC_COMM_SELF,fft->N,0,1,&list1);CHKERRQ(ierr);
768     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
769     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
770     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
771     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
772     ierr = ISDestroy(&list1);CHKERRQ(ierr);
773 #if !PetscDefined(HAVE_MPIUNI)
774   } else {
775     Mat_FFTW   *fftw = (Mat_FFTW*)fft->data;
776     PetscInt   ndim  = fft->ndim,*dim = fft->dim;
777     ptrdiff_t  local_n0,local_0_start;
778     ptrdiff_t  local_n1,local_1_start;
779     IS         list2;
780 #if !defined(PETSC_USE_COMPLEX)
781     PetscInt   i,j,k,partial_dim;
782     PetscInt   *indx1, *indx2, tempindx, tempindx1;
783     PetscInt   NM;
784     ptrdiff_t  temp;
785 #endif
786 
787     switch (ndim) {
788     case 1:
789 #if defined(PETSC_USE_COMPLEX)
790       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
791 
792       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);CHKERRQ(ierr);
793       ierr = ISCreateStride(comm,local_n0,low,1,&list2);CHKERRQ(ierr);
794       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
795       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
796       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
797       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
798       ierr = ISDestroy(&list1);CHKERRQ(ierr);
799       ierr = ISDestroy(&list2);CHKERRQ(ierr);
800 #else
801       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
802 #endif
803       break;
804     case 2:
805 #if defined(PETSC_USE_COMPLEX)
806       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
807 
808       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);CHKERRQ(ierr);
809       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);CHKERRQ(ierr);
810       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
811       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
812       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
813       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
814       ierr = ISDestroy(&list1);CHKERRQ(ierr);
815       ierr = ISDestroy(&list2);CHKERRQ(ierr);
816 #else
817       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
818 
819       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
820       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
821 
822       if (dim[1]%2==0) {
823         NM = dim[1]+2;
824       } else {
825         NM = dim[1]+1;
826       }
827       for (i=0; i<local_n0; i++) {
828         for (j=0; j<dim[1]; j++) {
829           tempindx  = i*dim[1] + j;
830           tempindx1 = i*NM + j;
831 
832           indx1[tempindx]=local_0_start*dim[1]+tempindx;
833           indx2[tempindx]=low+tempindx1;
834         }
835       }
836 
837       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
838       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
839 
840       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
841       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
842       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
843       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
844       ierr = ISDestroy(&list1);CHKERRQ(ierr);
845       ierr = ISDestroy(&list2);CHKERRQ(ierr);
846       ierr = PetscFree(indx1);CHKERRQ(ierr);
847       ierr = PetscFree(indx2);CHKERRQ(ierr);
848 #endif
849       break;
850 
851     case 3:
852 #if defined(PETSC_USE_COMPLEX)
853       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
854 
855       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);CHKERRQ(ierr);
856       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);CHKERRQ(ierr);
857       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
858       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
859       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
860       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
861       ierr = ISDestroy(&list1);CHKERRQ(ierr);
862       ierr = ISDestroy(&list2);CHKERRQ(ierr);
863 #else
864       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
865       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
866       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
867 
868       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
869       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
870 
871       if (dim[2]%2==0) NM = dim[2]+2;
872       else             NM = dim[2]+1;
873 
874       for (i=0; i<local_n0; i++) {
875         for (j=0; j<dim[1]; j++) {
876           for (k=0; k<dim[2]; k++) {
877             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
878             tempindx1 = i*dim[1]*NM + j*NM + k;
879 
880             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
881             indx2[tempindx]=low+tempindx1;
882           }
883         }
884       }
885 
886       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
887       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
888       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
889       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
890       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
891       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
892       ierr = ISDestroy(&list1);CHKERRQ(ierr);
893       ierr = ISDestroy(&list2);CHKERRQ(ierr);
894       ierr = PetscFree(indx1);CHKERRQ(ierr);
895       ierr = PetscFree(indx2);CHKERRQ(ierr);
896 #endif
897       break;
898 
899     default:
900 #if defined(PETSC_USE_COMPLEX)
901       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
902 
903       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);CHKERRQ(ierr);
904       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);CHKERRQ(ierr);
905       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
906       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
907       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
908       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
909       ierr = ISDestroy(&list1);CHKERRQ(ierr);
910       ierr = ISDestroy(&list2);CHKERRQ(ierr);
911 #else
912       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
913       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
914       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
915 
916       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
917 
918       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
919 
920       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
921 
922       partial_dim = fftw->partial_dim;
923 
924       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
925       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
926 
927       if (dim[ndim-1]%2==0) NM = 2;
928       else                  NM = 1;
929 
930       j = low;
931       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
932         indx1[i] = local_0_start*partial_dim + i;
933         indx2[i] = j;
934         if (k%dim[ndim-1]==0) j+=NM;
935         j++;
936       }
937       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
938       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
939       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
940       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
941       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
942       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
943       ierr = ISDestroy(&list1);CHKERRQ(ierr);
944       ierr = ISDestroy(&list2);CHKERRQ(ierr);
945       ierr = PetscFree(indx1);CHKERRQ(ierr);
946       ierr = PetscFree(indx2);CHKERRQ(ierr);
947 #endif
948       break;
949     }
950 #endif
951   }
952   PetscFunctionReturn(0);
953 }
954 
955 /*@
956    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
957 
958    Collective on Mat
959 
960     Input Parameters:
961 +   A - FFTW matrix
962 -   x - FFTW vector
963 
964    Output Parameters:
965 .  y - PETSc vector
966 
967    Level: intermediate
968 
969    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
970          VecScatterFFTWToPetsc removes those extra zeros.
971 
972 .seealso: VecScatterPetscToFFTW()
973 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Forward to the type-specific method composed on A under the name
     "VecScatterFFTWToPetsc_C" (VecScatterFFTWToPetsc_FFTW for MATFFTW) */
  ierr = PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
982 
983 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
984 {
985   PetscErrorCode ierr;
986   MPI_Comm       comm;
987   Mat_FFT        *fft  = (Mat_FFT*)A->data;
988   PetscInt       low;
989   PetscMPIInt    rank,size;
990   VecScatter     vecscat;
991   IS             list1;
992 
993   PetscFunctionBegin;
994   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
995   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
996   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
997   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
998 
999   if (size==1) {
1000     ierr = ISCreateStride(comm,fft->N,0,1,&list1);CHKERRQ(ierr);
1001     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
1002     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1003     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1004     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1005     ierr = ISDestroy(&list1);CHKERRQ(ierr);
1006 
1007 #if !PetscDefined(HAVE_MPIUNI)
1008   } else {
1009     Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
1010     PetscInt       ndim  = fft->ndim,*dim = fft->dim;
1011     ptrdiff_t      local_n0,local_0_start;
1012     ptrdiff_t      local_n1,local_1_start;
1013     IS             list2;
1014 #if !defined(PETSC_USE_COMPLEX)
1015     PetscInt       i,j,k,partial_dim;
1016     PetscInt       *indx1, *indx2, tempindx, tempindx1;
1017     PetscInt       NM;
1018     ptrdiff_t      temp;
1019 #endif
1020     switch (ndim) {
1021     case 1:
1022 #if defined(PETSC_USE_COMPLEX)
1023       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1024 
1025       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);CHKERRQ(ierr);
1026       ierr = ISCreateStride(comm,local_n1,low,1,&list2);CHKERRQ(ierr);
1027       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1028       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1029       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1030       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1031       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1032       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1033 #else
1034       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1035 #endif
1036       break;
1037     case 2:
1038 #if defined(PETSC_USE_COMPLEX)
1039       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1040 
1041       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);CHKERRQ(ierr);
1042       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);CHKERRQ(ierr);
1043       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1044       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1045       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1046       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1047       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1048       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1049 #else
1050       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1051 
1052       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
1053       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
1054 
1055       if (dim[1]%2==0) NM = dim[1]+2;
1056       else             NM = dim[1]+1;
1057 
1058       for (i=0; i<local_n0; i++) {
1059         for (j=0; j<dim[1]; j++) {
1060           tempindx = i*dim[1] + j;
1061           tempindx1 = i*NM + j;
1062 
1063           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1064           indx2[tempindx]=low+tempindx1;
1065         }
1066       }
1067 
1068       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1069       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1070 
1071       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1072       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1073       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1074       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1075       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1076       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1077       ierr = PetscFree(indx1);CHKERRQ(ierr);
1078       ierr = PetscFree(indx2);CHKERRQ(ierr);
1079 #endif
1080       break;
1081     case 3:
1082 #if defined(PETSC_USE_COMPLEX)
1083       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1084 
1085       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);CHKERRQ(ierr);
1086       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);CHKERRQ(ierr);
1087       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1088       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1089       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1090       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1091       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1092       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1093 #else
1094       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1095 
1096       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1097       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
1098 
1099       if (dim[2]%2==0) NM = dim[2]+2;
1100       else             NM = dim[2]+1;
1101 
1102       for (i=0; i<local_n0; i++) {
1103         for (j=0; j<dim[1]; j++) {
1104           for (k=0; k<dim[2]; k++) {
1105             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1106             tempindx1 = i*dim[1]*NM + j*NM + k;
1107 
1108             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1109             indx2[tempindx]=low+tempindx1;
1110           }
1111         }
1112       }
1113 
1114       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1115       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1116 
1117       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1118       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1119       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1120       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1121       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1122       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1123       ierr = PetscFree(indx1);CHKERRQ(ierr);
1124       ierr = PetscFree(indx2);CHKERRQ(ierr);
1125 #endif
1126       break;
1127     default:
1128 #if defined(PETSC_USE_COMPLEX)
1129       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1130 
1131       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);CHKERRQ(ierr);
1132       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);CHKERRQ(ierr);
1133       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1134       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1135       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1136       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1137       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1138       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1139 #else
1140       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1141 
1142       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1143 
1144       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1145 
1146       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1147 
1148       partial_dim = fftw->partial_dim;
1149 
1150       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1151       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1152 
1153       if (dim[ndim-1]%2==0) NM = 2;
1154       else                  NM = 1;
1155 
1156       j = low;
1157       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1158         indx1[i] = local_0_start*partial_dim + i;
1159         indx2[i] = j;
1160         if (k%dim[ndim-1]==0) j+=NM;
1161         j++;
1162       }
1163       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1164       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1165 
1166       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1167       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1168       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1169       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1170       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1171       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1172       ierr = PetscFree(indx1);CHKERRQ(ierr);
1173       ierr = PetscFree(indx2);CHKERRQ(ierr);
1174 #endif
1175       break;
1176     }
1177 #endif
1178   }
1179   PetscFunctionReturn(0);
1180 }
1181 
1182 /*
1183     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1184 
1185   Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1187 
1188    Level: intermediate
1189 
1190 */
1191 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1192 {
1193   PetscErrorCode ierr;
1194   MPI_Comm       comm;
1195   Mat_FFT        *fft = (Mat_FFT*)A->data;
1196   Mat_FFTW       *fftw;
1197   PetscInt       ndim = fft->ndim,*dim = fft->dim;
1198   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1199   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1200   PetscBool      flg;
1201   PetscInt       p_flag,partial_dim=1,ctr;
1202   PetscMPIInt    size,rank;
1203   ptrdiff_t      *pdim;
1204 #if !defined(PETSC_USE_COMPLEX)
1205   PetscInt       tot_dim;
1206 #endif
1207 
1208   PetscFunctionBegin;
1209   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1210   ierr = MPI_Comm_size(comm, &size);CHKERRMPI(ierr);
1211   ierr = MPI_Comm_rank(comm, &rank);CHKERRMPI(ierr);
1212 
1213 #if !PetscDefined(HAVE_MPIUNI)
1214   fftw_mpi_init();
1215 #endif
1216   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1217   pdim[0] = dim[0];
1218 #if !defined(PETSC_USE_COMPLEX)
1219   tot_dim = 2*dim[0];
1220 #endif
1221   for (ctr=1; ctr<ndim; ctr++) {
1222     partial_dim *= dim[ctr];
1223     pdim[ctr]    = dim[ctr];
1224 #if !defined(PETSC_USE_COMPLEX)
1225     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1226     else             tot_dim *= dim[ctr];
1227 #endif
1228   }
1229 
1230   if (size == 1) {
1231 #if defined(PETSC_USE_COMPLEX)
1232     ierr = MatSetSizes(A,fft->N,fft->N,fft->N,fft->N);CHKERRQ(ierr);
1233     fft->n = fft->N;
1234 #else
1235     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1236     fft->n = tot_dim;
1237 #endif
1238 #if !PetscDefined(HAVE_MPIUNI)
1239   } else {
1240     ptrdiff_t local_n0,local_0_start,local_n1,local_1_start;
1241 #if !defined(PETSC_USE_COMPLEX)
1242     ptrdiff_t temp;
1243     PetscInt  N1;
1244 #endif
1245 
1246     switch (ndim) {
1247     case 1:
1248 #if !defined(PETSC_USE_COMPLEX)
1249       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1250 #else
1251       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1252       fft->n = (PetscInt)local_n0;
1253       ierr   = MatSetSizes(A,local_n1,fft->n,fft->N,fft->N);CHKERRQ(ierr);
1254 #endif
1255       break;
1256     case 2:
1257 #if defined(PETSC_USE_COMPLEX)
1258       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1259       fft->n    = (PetscInt)local_n0*dim[1];
1260       ierr = MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);CHKERRQ(ierr);
1261 #else
1262       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1263 
1264       fft->n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1265       ierr = MatSetSizes(A,fft->n,fft->n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));CHKERRQ(ierr);
1266 #endif
1267       break;
1268     case 3:
1269 #if defined(PETSC_USE_COMPLEX)
1270       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1271 
1272       fft->n = (PetscInt)local_n0*dim[1]*dim[2];
1273       ierr = MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);CHKERRQ(ierr);
1274 #else
1275       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1276 
1277       fft->n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1278       ierr = MatSetSizes(A,fft->n,fft->n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));CHKERRQ(ierr);
1279 #endif
1280       break;
1281     default:
1282 #if defined(PETSC_USE_COMPLEX)
1283       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1284 
1285       fft->n = (PetscInt)local_n0*partial_dim;
1286       ierr = MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);CHKERRQ(ierr);
1287 #else
1288       temp = pdim[ndim-1];
1289 
1290       pdim[ndim-1] = temp/2 + 1;
1291 
1292       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1293 
1294       fft->n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1295       N1 = 2*fft->N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1296 
1297       pdim[ndim-1] = temp;
1298 
1299       ierr = MatSetSizes(A,fft->n,fft->n,N1,N1);CHKERRQ(ierr);
1300 #endif
1301       break;
1302     }
1303 #endif
1304   }
1305   free(pdim);
1306   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1307   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1308   fft->data = (void*)fftw;
1309 
1310   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1311   fftw->partial_dim = partial_dim;
1312 
1313   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
1314   if (size == 1) {
1315 #if defined(PETSC_USE_64BIT_INDICES)
1316     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1317 #else
1318     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1319 #endif
1320   }
1321 
1322   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1323 
1324   fftw->p_forward  = NULL;
1325   fftw->p_backward = NULL;
1326   fftw->p_flag     = FFTW_ESTIMATE;
1327 
1328   if (size == 1) {
1329     A->ops->mult          = MatMult_SeqFFTW;
1330     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1331 #if !PetscDefined(HAVE_MPIUNI)
1332   } else {
1333     A->ops->mult          = MatMult_MPIFFTW;
1334     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1335 #endif
1336   }
1337   fft->matdestroy = MatDestroy_FFTW;
1338   A->assembled    = PETSC_TRUE;
1339   A->preallocated = PETSC_TRUE;
1340 
1341   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);CHKERRQ(ierr);
1342   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1343   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1344 
1345   /* get runtime options */
1346   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1347   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
1348   if (flg) {
1349     fftw->p_flag = iplans[p_flag];
1350   }
1351   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1352   PetscFunctionReturn(0);
1353 }
1354 
1355