xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 009bbdc485cd9ad46be9940d3549e2dde9cdc322)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
/* Private data for MATFFTW matrices; hangs off Mat_FFT->data. */
typedef struct {
  ptrdiff_t   ndim_fftw,*dim_fftw;    /* dimensions in FFTW's ptrdiff_t form (used by the MPI planner interfaces) */
  PetscInt    partial_dim;            /* product of all dimensions except the first; set elsewhere in this file */
  fftw_plan   p_forward,p_backward;   /* cached plans, created lazily on first MatMult/MatMultTranspose */
  unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward) { /* create a plan, then excute it */
120     switch (ndim) {
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm=((PetscObject)A)->comm;
183 
184   PetscFunctionBegin;
185   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
186   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
187   if (!fftw->p_forward) { /* create a plan, then excute it */
188     switch (ndim) {
189     case 1:
190 #if defined(PETSC_USE_COMPLEX)
191       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
192 #else
193       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
194 #endif
195       break;
196     case 2:
197 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
198       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
201 #endif
202       break;
203     case 3:
204 #if defined(PETSC_USE_COMPLEX)
205       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     default:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     }
218     fftw->finarray  = x_array;
219     fftw->foutarray = y_array;
220     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
221                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
222     fftw_execute(fftw->p_forward);
223   } else { /* use existing plan */
224     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
225       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
226     } else {
227       fftw_execute(fftw->p_forward);
228     }
229   }
230   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
231   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
236    Input parameter:
237      A - the matrix
238      x - the vector on which BDFT will be performed
239 
240    Output parameter:
241      y - vector that stores result of BDFT
242 */
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
245 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
246 {
247   PetscErrorCode ierr;
248   Mat_FFT        *fft  = (Mat_FFT*)A->data;
249   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
250   PetscScalar    *x_array,*y_array;
251   PetscInt       ndim=fft->ndim,*dim=fft->dim;
252   MPI_Comm       comm=((PetscObject)A)->comm;
253 
254   PetscFunctionBegin;
255   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
256   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
257   if (!fftw->p_backward) { /* create a plan, then excute it */
258     switch (ndim) {
259     case 1:
260 #if defined(PETSC_USE_COMPLEX)
261       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
262 #else
263       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
264 #endif
265       break;
266     case 2:
267 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
268       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
269 #else
270       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
271 #endif
272       break;
273     case 3:
274 #if defined(PETSC_USE_COMPLEX)
275       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
276 #else
277       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
278 #endif
279       break;
280     default:
281 #if defined(PETSC_USE_COMPLEX)
282       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
283 #else
284       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
285 #endif
286       break;
287     }
288     fftw->binarray  = x_array;
289     fftw->boutarray = y_array;
290     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
291   } else { /* use existing plan */
292     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
293       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
294     } else {
295       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
296     }
297   }
298   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
299   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatDestroy_FFTW"
305 PetscErrorCode MatDestroy_FFTW(Mat A)
306 {
307   Mat_FFT        *fft = (Mat_FFT*)A->data;
308   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   fftw_destroy_plan(fftw->p_forward);
313   fftw_destroy_plan(fftw->p_backward);
314   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
315   ierr = PetscFree(fft->data);CHKERRQ(ierr);
316   fftw_mpi_cleanup();
317   PetscFunctionReturn(0);
318 }
319 
320 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
321 #undef __FUNCT__
322 #define __FUNCT__ "VecDestroy_MPIFFTW"
323 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
324 {
325   PetscErrorCode  ierr;
326   PetscScalar     *array;
327 
328   PetscFunctionBegin;
329   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
330   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
331   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
332   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecsFFTW"
/*@
   MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by FFTW

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Note: The parallel layout of output of forward FFTW is always same as the input
        of the backward FFTW. But parallel layout of the input vector of forward
        FFTW might not be same as the output of backward FFTW.
        Also note that we need to provide enough space while doing parallel real transform.
        We need to pad extra zeros at the end of last dimension. For this reason one needs to
        invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on if the last dimension is even or odd. If the last dimension is even one needs to pad two
        zeros; if it is odd only one zero is needed.
        Lastly one needs some scratch space at the end of data set in each process. alloc_local
        figures out how much space is needed, i.e. it figures out the data+scratch space for
        each processor and returns that.

.seealso: MatCreateFFTW()
@*/
PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the type-specific implementation (MatGetVecsFFTW_FFTW) if it is registered */
  ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
377 
378 EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetVecsFFTW_FFTW"
/* MatGetVecsFFTW_FFTW - type-specific implementation behind MatGetVecsFFTW().

   Creates up to three work vectors (fin: forward input, fout: forward output,
   bout: backward output) whose parallel layouts match what the FFTW plans expect.
   In the parallel case each vector's storage is obtained with fftw_malloc() (so that
   FFTW's alignment and data+scratch sizing from the fftw_mpi_local_size_* routines
   is honored) and the vector's destroy op is overridden with VecDestroy_MPIFFTW so
   the storage is released with fftw_free(). */
PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,rank;
  MPI_Comm       comm=((PetscObject)A)->comm;
  Mat_FFT        *fft = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  PetscInt       N=fft->N;
  PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);

  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  if (size == 1) { /* sequential case: plain Seq vectors, no FFTW-managed storage needed */
#if defined(PETSC_USE_COMPLEX)
    if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
    if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
    if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
#else
    /* real scalars: n (local, padded size) rather than N accounts for r2c padding */
    if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
    if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
    if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
#endif
  } else { /* parallel cases */
    ptrdiff_t      alloc_local,local_n0,local_0_start;
    ptrdiff_t      local_n1;
    fftw_complex   *data_fout;
    ptrdiff_t      local_1_start;
#if defined(PETSC_USE_COMPLEX)
    fftw_complex   *data_fin,*data_bout;
#else
    double         *data_finr,*data_boutr;
    PetscInt       n1,N1;  /* padded local/global sizes for real transforms */
    ptrdiff_t      temp;
#endif

    switch (ndim) {
    case 1:
#if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
#else
      /* 1D complex: forward and backward distributions can differ, so local sizes
         are queried separately for each direction */
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy = VecDestroy_MPIFFTW;
      }
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy = VecDestroy_MPIFFTW;
      }
      break;
#endif
      /* NOTE(review): in real builds case 1 exits via SETERRQ above (no break before case 2);
         there is no runtime fallthrough because SETERRQ returns */
    case 2:
#if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
      /* real 2D: last dimension is padded to 2*(dim[1]/2+1) doubles for the r2c layout */
      alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      /* complex 2D: all three vectors share the same layout (local size n, global N) */
      /* Get local size */
      alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    case 3:
#if !defined(PETSC_USE_COMPLEX)
      /* real 3D: same padding scheme as 2D, applied to the last dimension */
      alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }

      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    default:
#if !defined(PETSC_USE_COMPLEX)
      /* general ndim, real: temporarily halve-plus-one the last dimension to query the
         transposed local sizes, then restore it; N1 is the padded global size */
      temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
      alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }

      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    }
  }
  PetscFunctionReturn(0);
}
569 EXTERN_C_END
570 
#undef __FUNCT__
#define __FUNCT__ "VecScatterPetscToFFTW"
/*@
   VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

   Collective on Mat

   Input Parameters:
+  A - FFTW matrix
-  x - the PETSc vector

   Output Parameters:
.  y - the FFTW vector

  Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags

   Level: intermediate

   Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This is required even when
         one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job by a scattering operation.

.seealso: VecScatterFFTWToPetsc()
@*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the type-specific implementation (VecScatterPetscToFFTW_FFTW) if it is registered */
  ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
604 
605 EXTERN_C_BEGIN
606 #undef __FUNCT__
607 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
608 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
609 {
610   PetscErrorCode ierr;
611   MPI_Comm       comm=((PetscObject)A)->comm;
612   Mat_FFT        *fft = (Mat_FFT*)A->data;
613   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
614   PetscInt       N=fft->N;
615   PetscInt       ndim=fft->ndim,*dim=fft->dim;
616   PetscInt       low;
617   PetscInt       rank,size,vsize,vsize1;
618   ptrdiff_t      alloc_local,local_n0,local_0_start;
619   ptrdiff_t      local_n1,local_1_start;
620   VecScatter     vecscat;
621   IS             list1,list2;
622 #if !defined(PETSC_USE_COMPLEX)
623   PetscInt       i,j,k,partial_dim;
624   PetscInt       *indx1, *indx2, tempindx, tempindx1;
625   PetscInt       N1, n1 ,NM;
626   ptrdiff_t      temp;
627 #endif
628 
629   PetscFunctionBegin;
630   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
631   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
632   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
633 
634   if (size==1) {
635     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
636     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
637     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
638     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
639     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
640     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
641     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
642     ierr = ISDestroy(&list1);CHKERRQ(ierr);
643   } else {
644     switch (ndim) {
645     case 1:
646 #if defined(PETSC_USE_COMPLEX)
647       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
648       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
649       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
650       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
651       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
652       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
653       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
654       ierr = ISDestroy(&list1);CHKERRQ(ierr);
655       ierr = ISDestroy(&list2);CHKERRQ(ierr);
656 #else
657       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
658 #endif
659       break;
660     case 2:
661 #if defined(PETSC_USE_COMPLEX)
662       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
663       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
664       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
665       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
666       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
667       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
668       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
669       ierr = ISDestroy(&list1);CHKERRQ(ierr);
670       ierr = ISDestroy(&list2);CHKERRQ(ierr);
671 #else
672       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
673       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
674       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
675       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
676 
677       if (dim[1]%2==0) {
678         NM = dim[1]+2;
679       } else {
680         NM = dim[1]+1;
681       }
682       for (i=0;i<local_n0;i++) {
683          for (j=0;j<dim[1];j++) {
684            tempindx = i*dim[1] + j;
685            tempindx1 = i*NM + j;
686            indx1[tempindx]=local_0_start*dim[1]+tempindx;
687            indx2[tempindx]=low+tempindx1;
688          }
689       }
690 
691       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
692       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
693 
694       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
695       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
696       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
697       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
698       ierr = ISDestroy(&list1);CHKERRQ(ierr);
699       ierr = ISDestroy(&list2);CHKERRQ(ierr);
700       ierr = PetscFree(indx1);CHKERRQ(ierr);
701       ierr = PetscFree(indx2);CHKERRQ(ierr);
702 #endif
703       break;
704 
705     case 3:
706 #if defined(PETSC_USE_COMPLEX)
707       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
708       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
709       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
710       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
711       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
712       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
713       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
714       ierr = ISDestroy(&list1);CHKERRQ(ierr);
715       ierr = ISDestroy(&list2);CHKERRQ(ierr);
716 #else
717       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
718       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
719 
720       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
721       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
722 
723       if (dim[2]%2==0) NM = dim[2]+2;
724       else             NM = dim[2]+1;
725 
726       for (i=0;i<local_n0;i++) {
727         for (j=0;j<dim[1];j++) {
728           for (k=0;k<dim[2];k++) {
729             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
730             tempindx1 = i*dim[1]*NM + j*NM + k;
731             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
732             indx2[tempindx]=low+tempindx1;
733           }
734         }
735       }
736 
737       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
738       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
739       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
740       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
741       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
742       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
743       ierr = ISDestroy(&list1);CHKERRQ(ierr);
744       ierr = ISDestroy(&list2);CHKERRQ(ierr);
745       ierr = PetscFree(indx1);CHKERRQ(ierr);
746       ierr = PetscFree(indx2);CHKERRQ(ierr);
747 #endif
748       break;
749 
750     default:
751 #if defined(PETSC_USE_COMPLEX)
752       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
753       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
754       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
755       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
756       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
757       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
758       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
759       ierr = ISDestroy(&list1);CHKERRQ(ierr);
760       ierr = ISDestroy(&list2);CHKERRQ(ierr);
761 #else
762       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
763       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
764       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
765       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
766       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
767 
768       partial_dim = fftw->partial_dim;
769 
770       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
771       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
772 
773       if (dim[ndim-1]%2==0) NM = 2;
774       else                  NM = 1;
775 
776       j = low;
777       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
778         indx1[i] = local_0_start*partial_dim + i;
779         indx2[i] = j;
780         if (k%dim[ndim-1]==0) { j+=NM;}
781         j++;
782       }
783       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
784       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
785       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
786       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
787       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
788       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
789       ierr = ISDestroy(&list1);CHKERRQ(ierr);
790       ierr = ISDestroy(&list2);CHKERRQ(ierr);
791       ierr = PetscFree(indx1);CHKERRQ(ierr);
792       ierr = PetscFree(indx2);CHKERRQ(ierr);
793 #endif
794       break;
795     }
796   }
797   PetscFunctionReturn(0);
798 }
799 EXTERN_C_END
800 
#undef __FUNCT__
#define __FUNCT__ "VecScatterFFTWToPetsc"
/*@
   VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

   Collective on Mat

    Input Parameters:
+   A - FFTW matrix
-   x - FFTW vector

   Output Parameters:
.  y - PETSc vector

   Level: intermediate

   Note: When performing a real transform, the FFTW output of the backward DFT contains
         extra padding zeros at the end of the last dimension.
         VecScatterFFTWToPetsc removes those extra zeros.

.seealso: VecScatterPetscToFFTW()
@*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed on A
     (VecScatterFFTWToPetsc_FFTW); a no-op if A is not a MATFFTW matrix. */
  ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
830 
831 EXTERN_C_BEGIN
832 #undef __FUNCT__
833 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
834 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
835 {
836   PetscErrorCode ierr;
837   MPI_Comm       comm=((PetscObject)A)->comm;
838   Mat_FFT        *fft = (Mat_FFT*)A->data;
839   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
840   PetscInt       N=fft->N;
841   PetscInt       ndim=fft->ndim,*dim=fft->dim;
842   PetscInt       low;
843   PetscInt       rank,size;
844   ptrdiff_t      alloc_local,local_n0,local_0_start;
845   ptrdiff_t      local_n1,local_1_start;
846   VecScatter     vecscat;
847   IS             list1,list2;
848 #if !defined(PETSC_USE_COMPLEX)
849   PetscInt       i,j,k,partial_dim;
850   PetscInt       *indx1, *indx2, tempindx, tempindx1;
851   PetscInt       N1, n1 ,NM;
852   ptrdiff_t      temp;
853 #endif
854 
855   PetscFunctionBegin;
856   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
857   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
858   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
859 
860   if (size==1) {
861     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
862     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
863     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
864     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
865     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
866     ierr = ISDestroy(&list1);CHKERRQ(ierr);
867 
868   } else {
869     switch (ndim) {
870     case 1:
871 #if defined(PETSC_USE_COMPLEX)
872       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
873       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
874       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
875       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
876       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
877       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
878       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
879       ierr = ISDestroy(&list1);CHKERRQ(ierr);
880       ierr = ISDestroy(&list2);CHKERRQ(ierr);
881 #else
882       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
883 #endif
884       break;
885     case 2:
886 #if defined(PETSC_USE_COMPLEX)
887       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
888       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
889       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
890       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
891       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
892       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
893       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
894       ierr = ISDestroy(&list1);CHKERRQ(ierr);
895       ierr = ISDestroy(&list2);CHKERRQ(ierr);
896 #else
897       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
898       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
899 
900       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
901       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
902 
903       if (dim[1]%2==0) NM = dim[1]+2;
904       else             NM = dim[1]+1;
905 
906       for (i=0;i<local_n0;i++) {
907         for (j=0;j<dim[1];j++) {
908           tempindx = i*dim[1] + j;
909           tempindx1 = i*NM + j;
910           indx1[tempindx]=local_0_start*dim[1]+tempindx;
911           indx2[tempindx]=low+tempindx1;
912         }
913       }
914 
915       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
916       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
917 
918       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
919       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
920       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
921       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
922       ierr = ISDestroy(&list1);CHKERRQ(ierr);
923       ierr = ISDestroy(&list2);CHKERRQ(ierr);
924       ierr = PetscFree(indx1);CHKERRQ(ierr);
925       ierr = PetscFree(indx2);CHKERRQ(ierr);
926 #endif
927       break;
928     case 3:
929 #if defined(PETSC_USE_COMPLEX)
930       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
931       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
932       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
933       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
934       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
935       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
936       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
937       ierr = ISDestroy(&list1);CHKERRQ(ierr);
938       ierr = ISDestroy(&list2);CHKERRQ(ierr);
939 #else
940 
941       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
942       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
943 
944       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
945       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
946 
947       if (dim[2]%2==0) NM = dim[2]+2;
948       else             NM = dim[2]+1;
949 
950       for (i=0;i<local_n0;i++) {
951         for (j=0;j<dim[1];j++) {
952           for (k=0;k<dim[2];k++) {
953             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
954             tempindx1 = i*dim[1]*NM + j*NM + k;
955             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
956             indx2[tempindx]=low+tempindx1;
957           }
958         }
959       }
960 
961       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
962       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
963 
964       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
965       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
966       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
967       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
968       ierr = ISDestroy(&list1);CHKERRQ(ierr);
969       ierr = ISDestroy(&list2);CHKERRQ(ierr);
970       ierr = PetscFree(indx1);CHKERRQ(ierr);
971       ierr = PetscFree(indx2);CHKERRQ(ierr);
972 #endif
973       break;
974     default:
975 #if defined(PETSC_USE_COMPLEX)
976       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
977       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
978       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
979       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
980       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
981       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
982       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
983       ierr = ISDestroy(&list1);CHKERRQ(ierr);
984       ierr = ISDestroy(&list2);CHKERRQ(ierr);
985 #else
986       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
987       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
988       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
989       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
990       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
991 
992       partial_dim = fftw->partial_dim;
993 
994       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
995       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
996 
997       if (dim[ndim-1]%2==0) NM = 2;
998       else                  NM = 1;
999 
1000       j = low;
1001       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
1002         indx1[i] = local_0_start*partial_dim + i;
1003         indx2[i] = j;
1004         if (k%dim[ndim-1]==0)j+=NM;
1005         j++;
1006       }
1007       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1008       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1009 
1010       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1011       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1012       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1013       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1014       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1015       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1016       ierr = PetscFree(indx1);CHKERRQ(ierr);
1017       ierr = PetscFree(indx2);CHKERRQ(ierr);
1018 #endif
1019       break;
1020     }
1021   }
1022   PetscFunctionReturn(0);
1023 }
1024 EXTERN_C_END
1025 
1026 EXTERN_C_BEGIN
1027 #undef __FUNCT__
1028 #define __FUNCT__ "MatCreate_FFTW"
1029 /*
1030     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1031 
1032   Options Database Keys:
1033 + -mat_fftw_plannerflags - set FFTW planner flags
1034 
1035    Level: intermediate
1036 
1037 */
1038 PetscErrorCode MatCreate_FFTW(Mat A)
1039 {
1040   PetscErrorCode ierr;
1041   MPI_Comm       comm=((PetscObject)A)->comm;
1042   Mat_FFT        *fft=(Mat_FFT*)A->data;
1043   Mat_FFTW       *fftw;
1044   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1045   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1046   PetscBool      flg;
1047   PetscInt       p_flag,partial_dim=1,ctr;
1048   PetscMPIInt    size,rank;
1049   ptrdiff_t      *pdim;
1050   ptrdiff_t      local_n1,local_1_start;
1051 #if !defined(PETSC_USE_COMPLEX)
1052    ptrdiff_t     temp;
1053    PetscInt      N1,tot_dim;
1054 #else
1055    PetscInt      n1;
1056 #endif
1057 
1058   PetscFunctionBegin;
1059   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1060   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1061 
1062   fftw_mpi_init();
1063   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1064   pdim[0] = dim[0];
1065 #if !defined(PETSC_USE_COMPLEX)
1066   tot_dim = 2*dim[0];
1067 #endif
1068   for (ctr=1;ctr<ndim;ctr++) {
1069     partial_dim *= dim[ctr];
1070     pdim[ctr] = dim[ctr];
1071 #if !defined(PETSC_USE_COMPLEX)
1072     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1073     else             tot_dim *= dim[ctr];
1074 #endif
1075   }
1076 
1077   if (size == 1) {
1078 #if defined(PETSC_USE_COMPLEX)
1079     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1080     n = N;
1081 #else
1082     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1083     n = tot_dim;
1084 #endif
1085 
1086   } else {
1087     ptrdiff_t alloc_local,local_n0,local_0_start;
1088     switch (ndim) {
1089     case 1:
1090 #if !defined(PETSC_USE_COMPLEX)
1091       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1092 #else
1093       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1094       n = (PetscInt)local_n0;
1095       n1 = (PetscInt) local_n1;
1096       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1097 #endif
1098       break;
1099     case 2:
1100 #if defined(PETSC_USE_COMPLEX)
1101       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1102       /*
1103        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1104        PetscSynchronizedFlush(comm);
1105        */
1106       n = (PetscInt)local_n0*dim[1];
1107       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1108 #else
1109       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1110       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1111       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1112 #endif
1113       break;
1114     case 3:
1115 #if defined(PETSC_USE_COMPLEX)
1116       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1117       n = (PetscInt)local_n0*dim[1]*dim[2];
1118       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1119 #else
1120       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1121       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1122       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1123 #endif
1124       break;
1125     default:
1126 #if defined(PETSC_USE_COMPLEX)
1127       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1128       n = (PetscInt)local_n0*partial_dim;
1129       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1130 #else
1131       temp = pdim[ndim-1];
1132       pdim[ndim-1] = temp/2 + 1;
1133       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1134       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1135       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1136       pdim[ndim-1] = temp;
1137       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1138 #endif
1139       break;
1140     }
1141   }
1142   ierr = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1143   ierr = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1144   fft->data = (void*)fftw;
1145 
1146   fft->n            = n;
1147   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1148   fftw->partial_dim = partial_dim;
1149   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));CHKERRQ(ierr);
1150   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1151 
1152   fftw->p_forward  = 0;
1153   fftw->p_backward = 0;
1154   fftw->p_flag     = FFTW_ESTIMATE;
1155 
1156   if (size == 1) {
1157     A->ops->mult          = MatMult_SeqFFTW;
1158     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1159   } else {
1160     A->ops->mult          = MatMult_MPIFFTW;
1161     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1162   }
1163   fft->matdestroy = MatDestroy_FFTW;
1164   A->assembled    = PETSC_TRUE;
1165   A->preallocated = PETSC_TRUE;
1166   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1167   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1168   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1169 
1170   /* get runtime options */
1171   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1172     ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1173     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1174   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1175   PetscFunctionReturn(0);
1176 }
1177 EXTERN_C_END
1178 
1179 
1180 
1181 
1182