xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 5c8f6a953e7ed1c81f507d64aebddb11080b60e9)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
/* Private data for the FFTW matrix type: lazily-created FFTW plans plus the
   arrays those plans were created with */
typedef struct {
  ptrdiff_t   ndim_fftw,*dim_fftw;  /* dimensions in FFTW's ptrdiff_t representation (used by the MPI planner interface) */
  PetscInt    partial_dim;          /* NOTE(review): presumably a partial product of the dimensions used for parallel layouts -- confirm against the FFTW Mat constructor */
  fftw_plan   p_forward,p_backward; /* created on first MatMult/MatMultTranspose; NULL until then */
  unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
/* MatMult_SeqFFTW performs serial forward DFT
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward) { /* create a plan, then excute it */
120     switch (ndim) {
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm=((PetscObject)A)->comm;
183 
184   PetscFunctionBegin;
185   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
186   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
187   if (!fftw->p_forward) { /* create a plan, then excute it */
188     switch (ndim) {
189     case 1:
190 #if defined(PETSC_USE_COMPLEX)
191       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
192 #else
193       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
194 #endif
195       break;
196     case 2:
197 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
198       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
201 #endif
202       break;
203     case 3:
204 #if defined(PETSC_USE_COMPLEX)
205       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     default:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     }
218     fftw->finarray  = x_array;
219     fftw->foutarray = y_array;
220     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
221                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
222     fftw_execute(fftw->p_forward);
223   } else { /* use existing plan */
224     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
225       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
226     } else {
227       fftw_execute(fftw->p_forward);
228     }
229   }
230   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
231   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
236    Input parameter:
237      A - the matrix
238      x - the vector on which BDFT will be performed
239 
240    Output parameter:
241      y - vector that stores result of BDFT
242 */
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
245 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
246 {
247   PetscErrorCode ierr;
248   Mat_FFT        *fft  = (Mat_FFT*)A->data;
249   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
250   PetscScalar    *x_array,*y_array;
251   PetscInt       ndim=fft->ndim,*dim=fft->dim;
252   MPI_Comm       comm=((PetscObject)A)->comm;
253 
254   PetscFunctionBegin;
255   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
256   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
257   if (!fftw->p_backward) { /* create a plan, then excute it */
258     switch (ndim) {
259     case 1:
260 #if defined(PETSC_USE_COMPLEX)
261       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
262 #else
263       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
264 #endif
265       break;
266     case 2:
267 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
268       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
269 #else
270       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
271 #endif
272       break;
273     case 3:
274 #if defined(PETSC_USE_COMPLEX)
275       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
276 #else
277       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
278 #endif
279       break;
280     default:
281 #if defined(PETSC_USE_COMPLEX)
282       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
283 #else
284       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
285 #endif
286       break;
287     }
288     fftw->binarray  = x_array;
289     fftw->boutarray = y_array;
290     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
291   } else { /* use existing plan */
292     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
293       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
294     } else {
295       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
296     }
297   }
298   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
299   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatDestroy_FFTW"
305 PetscErrorCode MatDestroy_FFTW(Mat A)
306 {
307   Mat_FFT        *fft = (Mat_FFT*)A->data;
308   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   fftw_destroy_plan(fftw->p_forward);
313   fftw_destroy_plan(fftw->p_backward);
314   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
315   ierr = PetscFree(fft->data);CHKERRQ(ierr);
316   fftw_mpi_cleanup();
317   PetscFunctionReturn(0);
318 }
319 
320 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
321 #undef __FUNCT__
322 #define __FUNCT__ "VecDestroy_MPIFFTW"
323 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
324 {
325   PetscErrorCode  ierr;
326   PetscScalar     *array;
327 
328   PetscFunctionBegin;
329   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
330   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
331   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
332   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatGetVecsFFTW"
338 /*@
339    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
340      parallel layout determined by FFTW
341 
342    Collective on Mat
343 
344    Input Parameter:
345 .   A - the matrix
346 
347    Output Parameter:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW
351 
352   Level: advanced
353 
354   Note: The parallel layout of output of forward FFTW is always same as the input
355         of the backward FFTW. But parallel layout of the input vector of forward
356         FFTW might not be same as the output of backward FFTW.
357         Also note that we need to provide enough space while doing parallel real transform.
        We need to pad extra zeros at the end of last dimension. For this reason one needs to
        invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
360         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
361         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
362         zeros if it is odd only one zero is needed.
363         Lastly one needs some scratch space at the end of data set in each process. alloc_local
364         figures out how much space is needed, i.e. it figures out the data+scratch space for
365         each processor and returns that.
366 
367 .seealso: MatCreateFFTW()
368 @*/
PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the type-specific implementation (MatGetVecsFFTW_FFTW) if the
     method is registered on A; a no-op otherwise */
  ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
377 
378 EXTERN_C_BEGIN
379 #undef __FUNCT__
380 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
/* MatGetVecsFFTW_FFTW - creates work vectors whose local sizes match the data
   layout FFTW expects for the input/output of the forward and backward transforms.
   The arrays of the parallel vectors are allocated with fftw_malloc() (FFTW may
   need extra scratch space beyond the visible data, as reported by the
   fftw_mpi_local_size_* routines), so their destroy op is overridden with
   VecDestroy_MPIFFTW which calls fftw_free(). Any of fin/fout/bout may be NULL. */
PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
{
  PetscErrorCode ierr;
  PetscMPIInt    size,rank;
  MPI_Comm       comm=((PetscObject)A)->comm;
  Mat_FFT        *fft = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  PetscInt       N=fft->N;
  PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);

  ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
  if (size == 1) { /* sequential case: plain Seq vectors, no FFTW-specific layout needed */
#if defined(PETSC_USE_COMPLEX)
    if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
    if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
    if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
#else
    /* real transforms: n already accounts for the r2c padded layout */
    if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
    if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
    if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
#endif
  } else { /* parallel cases */
    ptrdiff_t      alloc_local,local_n0,local_0_start;
    ptrdiff_t      local_n1;
    fftw_complex   *data_fout;
    ptrdiff_t      local_1_start;
#if defined(PETSC_USE_COMPLEX)
    fftw_complex   *data_fin,*data_bout;
#else
    double         *data_finr,*data_boutr;
    PetscInt       n1,N1; /* local/global sizes of the padded real layout */
    ptrdiff_t      temp;
#endif

    switch (ndim) {
    case 1:
#if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
#else
      /* 1D: input and output may have different local distributions (local_n0 vs local_n1) */
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy = VecDestroy_MPIFFTW;
      }
      /* backward transform may need a different allocation/distribution */
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy = VecDestroy_MPIFFTW;
      }
      break;
#endif
    case 2:
#if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
      /* r2c: last dimension padded to 2*(dim[1]/2+1) real values */
      alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      /* Get local size */
      alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    case 3:
#if !defined(PETSC_USE_COMPLEX)
      /* r2c: last dimension padded to 2*(dim[2]/2+1) real values */
      alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }

      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    default:
#if !defined(PETSC_USE_COMPLEX)
      /* temporarily replace the last dimension with its r2c size (temp/2+1) to query
         the local sizes, then restore it */
      temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
      alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      if (fin) {
        data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
        ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }

      if (fout) {
        data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    }
  }
  PetscFunctionReturn(0);
}
569 EXTERN_C_END
570 
571 #undef __FUNCT__
572 #define __FUNCT__ "VecScatterPetscToFFTW"
573 /*@
574    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
575 
576    Collective on Mat
577 
578    Input Parameters:
579 +  A - FFTW matrix
580 -  x - the PETSc vector
581 
582    Output Parameters:
583 .  y - the FFTW vector
584 
585   Options Database Keys:
586 . -mat_fftw_plannerflags - set FFTW planner flags
587 
588    Level: intermediate
589 
   Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This is required even when
591          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
592          zeros. This routine does that job by scattering operation.
593 
594 .seealso: VecScatterFFTWToPetsc()
595 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the type-specific implementation (VecScatterPetscToFFTW_FFTW)
     if the method is registered on A; a no-op otherwise */
  ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
604 
605 EXTERN_C_BEGIN
606 #undef __FUNCT__
607 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
608 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
609 {
610   PetscErrorCode ierr;
611   MPI_Comm       comm=((PetscObject)A)->comm;
612   Mat_FFT        *fft = (Mat_FFT*)A->data;
613   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
614   PetscInt       N=fft->N;
615   PetscInt       ndim=fft->ndim,*dim=fft->dim;
616   PetscInt       low;
617   PetscInt       rank,size,vsize,vsize1;
618   ptrdiff_t      alloc_local,local_n0,local_0_start;
619   ptrdiff_t      local_n1,local_1_start;
620   VecScatter     vecscat;
621   IS             list1,list2;
622 #if !defined(PETSC_USE_COMPLEX)
623   PetscInt       i,j,k,partial_dim;
624   PetscInt       *indx1, *indx2, tempindx, tempindx1;
625   PetscInt       N1, n1 ,NM;
626   ptrdiff_t      temp;
627 #endif
628 
629   PetscFunctionBegin;
630   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
631   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
632   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
633 
634   if (size==1) {
635     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
636     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
637     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
638     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
639     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
640     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
641     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
642     ierr = ISDestroy(&list1);CHKERRQ(ierr);
643   } else {
644     switch (ndim) {
645     case 1:
646 #if defined(PETSC_USE_COMPLEX)
647       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
648       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
649       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
650       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
651       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
652       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
653       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
654       ierr = ISDestroy(&list1);CHKERRQ(ierr);
655       ierr = ISDestroy(&list2);CHKERRQ(ierr);
656 #else
657       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
658 #endif
659       break;
660     case 2:
661 #if defined(PETSC_USE_COMPLEX)
662       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
663       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
664       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
665       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
666       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
667       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
668       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
669       ierr = ISDestroy(&list1);CHKERRQ(ierr);
670       ierr = ISDestroy(&list2);CHKERRQ(ierr);
671 #else
672       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
673       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
674       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
675       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
676 
677       if (dim[1]%2==0) {
678         NM = dim[1]+2;
679       } else {
680         NM = dim[1]+1;
681       }
682       for (i=0;i<local_n0;i++) {
683          for (j=0;j<dim[1];j++) {
684            tempindx = i*dim[1] + j;
685            tempindx1 = i*NM + j;
686            indx1[tempindx]=local_0_start*dim[1]+tempindx;
687            indx2[tempindx]=low+tempindx1;
688          }
689       }
690 
691       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
692       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
693 
694       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
695       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
696       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
697       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
698       ierr = ISDestroy(&list1);CHKERRQ(ierr);
699       ierr = ISDestroy(&list2);CHKERRQ(ierr);
700       ierr = PetscFree(indx1);CHKERRQ(ierr);
701       ierr = PetscFree(indx2);CHKERRQ(ierr);
702 #endif
703       break;
704 
705     case 3:
706 #if defined(PETSC_USE_COMPLEX)
707       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
708       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
709       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
710       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
711       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
712       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
713       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
714       ierr = ISDestroy(&list1);CHKERRQ(ierr);
715       ierr = ISDestroy(&list2);CHKERRQ(ierr);
716 #else
717       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
718       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
719 
720       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
721       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
722 
723       if (dim[2]%2==0) NM = dim[2]+2;
724       else             NM = dim[2]+1;
725 
726       for (i=0;i<local_n0;i++) {
727         for (j=0;j<dim[1];j++) {
728           for (k=0;k<dim[2];k++) {
729             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
730             tempindx1 = i*dim[1]*NM + j*NM + k;
731             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
732             indx2[tempindx]=low+tempindx1;
733           }
734         }
735       }
736 
737       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
738       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
739       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
740       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
741       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
742       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
743       ierr = ISDestroy(&list1);CHKERRQ(ierr);
744       ierr = ISDestroy(&list2);CHKERRQ(ierr);
745       ierr = PetscFree(indx1);CHKERRQ(ierr);
746       ierr = PetscFree(indx2);CHKERRQ(ierr);
747 #endif
748       break;
749 
750     default:
751 #if defined(PETSC_USE_COMPLEX)
752       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
753       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
754       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
755       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
756       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
757       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
758       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
759       ierr = ISDestroy(&list1);CHKERRQ(ierr);
760       ierr = ISDestroy(&list2);CHKERRQ(ierr);
761 #else
762       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
763       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
764       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
765       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
766       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
767 
768       partial_dim = fftw->partial_dim;
769 
770       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
771       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
772 
773       if (dim[ndim-1]%2==0) NM = 2;
774       else                  NM = 1;
775 
776       j = low;
777       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
778         indx1[i] = local_0_start*partial_dim + i;
779         indx2[i] = j;
780         if (k%dim[ndim-1]==0) { j+=NM;}
781         j++;
782       }
783       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
784       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
785       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
786       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
787       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
788       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
789       ierr = ISDestroy(&list1);CHKERRQ(ierr);
790       ierr = ISDestroy(&list2);CHKERRQ(ierr);
791       ierr = PetscFree(indx1);CHKERRQ(ierr);
792       ierr = PetscFree(indx2);CHKERRQ(ierr);
793 #endif
794       break;
795     }
796   }
797   PetscFunctionReturn(0);
798 }
799 EXTERN_C_END
800 
801 #undef __FUNCT__
802 #define __FUNCT__ "VecScatterFFTWToPetsc"
803 /*@
804    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
805 
806    Collective on Mat
807 
808     Input Parameters:
809 +   A - FFTW matrix
810 -   x - FFTW vector
811 
812    Output Parameter:
813 .  y - PETSc vector
814 
815    Level: intermediate
816 
817    Note: When performing a real transform, the FFTW output of the backward DFT contains extra padding zeros at the end of the last dimension.
818          VecScatterFFTWToPetsc removes those extra zeros.
819 
820 .seealso: VecScatterPetscToFFTW()
821 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW); PetscTryMethod is a no-op when the
     matrix type has not composed that method */
  ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
829 
830 EXTERN_C_BEGIN
831 #undef __FUNCT__
832 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
833 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
834 {
835   PetscErrorCode ierr;
836   MPI_Comm       comm=((PetscObject)A)->comm;
837   Mat_FFT        *fft = (Mat_FFT*)A->data;
838   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
839   PetscInt       N=fft->N;
840   PetscInt       ndim=fft->ndim,*dim=fft->dim;
841   PetscInt       low;
842   PetscInt       rank,size;
843   ptrdiff_t      alloc_local,local_n0,local_0_start;
844   ptrdiff_t      local_n1,local_1_start;
845   VecScatter     vecscat;
846   IS             list1,list2;
847 #if !defined(PETSC_USE_COMPLEX)
848   PetscInt       i,j,k,partial_dim;
849   PetscInt       *indx1, *indx2, tempindx, tempindx1;
850   PetscInt       N1, n1 ,NM;
851   ptrdiff_t      temp;
852 #endif
853 
854   PetscFunctionBegin;
855   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
856   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
857   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
858 
859   if (size==1) {
860     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
861     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
862     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
863     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
864     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
865     ierr = ISDestroy(&list1);CHKERRQ(ierr);
866 
867   } else {
868     switch (ndim) {
869     case 1:
870 #if defined(PETSC_USE_COMPLEX)
871       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
872       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
873       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
874       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
875       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
876       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
877       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
878       ierr = ISDestroy(&list1);CHKERRQ(ierr);
879       ierr = ISDestroy(&list2);CHKERRQ(ierr);
880 #else
881       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
882 #endif
883       break;
884     case 2:
885 #if defined(PETSC_USE_COMPLEX)
886       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
887       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
888       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
889       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
890       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
891       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
892       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
893       ierr = ISDestroy(&list1);CHKERRQ(ierr);
894       ierr = ISDestroy(&list2);CHKERRQ(ierr);
895 #else
896       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
897       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
898 
899       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
900       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
901 
902       if (dim[1]%2==0) NM = dim[1]+2;
903       else             NM = dim[1]+1;
904 
905       for (i=0;i<local_n0;i++) {
906         for (j=0;j<dim[1];j++) {
907           tempindx = i*dim[1] + j;
908           tempindx1 = i*NM + j;
909           indx1[tempindx]=local_0_start*dim[1]+tempindx;
910           indx2[tempindx]=low+tempindx1;
911         }
912       }
913 
914       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
915       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
916 
917       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
918       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
919       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
920       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
921       ierr = ISDestroy(&list1);CHKERRQ(ierr);
922       ierr = ISDestroy(&list2);CHKERRQ(ierr);
923       ierr = PetscFree(indx1);CHKERRQ(ierr);
924       ierr = PetscFree(indx2);CHKERRQ(ierr);
925 #endif
926       break;
927     case 3:
928 #if defined(PETSC_USE_COMPLEX)
929       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
930       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
931       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
932       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
933       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
934       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
935       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
936       ierr = ISDestroy(&list1);CHKERRQ(ierr);
937       ierr = ISDestroy(&list2);CHKERRQ(ierr);
938 #else
939 
940       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
941       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
942 
943       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
944       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
945 
946       if (dim[2]%2==0) NM = dim[2]+2;
947       else             NM = dim[2]+1;
948 
949       for (i=0;i<local_n0;i++) {
950         for (j=0;j<dim[1];j++) {
951           for (k=0;k<dim[2];k++) {
952             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
953             tempindx1 = i*dim[1]*NM + j*NM + k;
954             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
955             indx2[tempindx]=low+tempindx1;
956           }
957         }
958       }
959 
960       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
961       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
962 
963       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
964       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
965       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
966       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
967       ierr = ISDestroy(&list1);CHKERRQ(ierr);
968       ierr = ISDestroy(&list2);CHKERRQ(ierr);
969       ierr = PetscFree(indx1);CHKERRQ(ierr);
970       ierr = PetscFree(indx2);CHKERRQ(ierr);
971 #endif
972       break;
973     default:
974 #if defined(PETSC_USE_COMPLEX)
975       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
976       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
977       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
978       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
979       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
980       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
981       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
982       ierr = ISDestroy(&list1);CHKERRQ(ierr);
983       ierr = ISDestroy(&list2);CHKERRQ(ierr);
984 #else
985       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
986       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
987       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
988       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
989       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
990 
991       partial_dim = fftw->partial_dim;
992 
993       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
994       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
995 
996       if (dim[ndim-1]%2==0) NM = 2;
997       else                  NM = 1;
998 
999       j = low;
1000       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
1001         indx1[i] = local_0_start*partial_dim + i;
1002         indx2[i] = j;
1003         if (k%dim[ndim-1]==0)j+=NM;
1004         j++;
1005       }
1006       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1007       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1008 
1009       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1010       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1011       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1012       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1013       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1014       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1015       ierr = PetscFree(indx1);CHKERRQ(ierr);
1016       ierr = PetscFree(indx2);CHKERRQ(ierr);
1017 #endif
1018       break;
1019     }
1020   }
1021   PetscFunctionReturn(0);
1022 }
1023 EXTERN_C_END
1024 
1025 EXTERN_C_BEGIN
1026 #undef __FUNCT__
1027 #define __FUNCT__ "MatCreate_FFTW"
1028 /*
1029     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1030 
1031   Options Database Keys:
1032 + -mat_fftw_plannerflags - set FFTW planner flags
1033 
1034    Level: intermediate
1035 
1036 */
1037 PetscErrorCode MatCreate_FFTW(Mat A)
1038 {
1039   PetscErrorCode ierr;
1040   MPI_Comm       comm=((PetscObject)A)->comm;
1041   Mat_FFT        *fft=(Mat_FFT*)A->data;
1042   Mat_FFTW       *fftw;
1043   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1044   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1045   PetscBool      flg;
1046   PetscInt       p_flag,partial_dim=1,ctr;
1047   PetscMPIInt    size,rank;
1048   ptrdiff_t      *pdim;
1049   ptrdiff_t      local_n1,local_1_start;
1050 #if !defined(PETSC_USE_COMPLEX)
1051    ptrdiff_t     temp;
1052    PetscInt      N1,tot_dim;
1053 #else
1054    PetscInt      n1;
1055 #endif
1056 
1057   PetscFunctionBegin;
1058   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1059   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1060 
1061   fftw_mpi_init();
1062   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1063   pdim[0] = dim[0];
1064 #if !defined(PETSC_USE_COMPLEX)
1065   tot_dim = 2*dim[0];
1066 #endif
1067   for (ctr=1;ctr<ndim;ctr++) {
1068     partial_dim *= dim[ctr];
1069     pdim[ctr] = dim[ctr];
1070 #if !defined(PETSC_USE_COMPLEX)
1071     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1072     else             tot_dim *= dim[ctr];
1073 #endif
1074   }
1075 
1076   if (size == 1) {
1077 #if defined(PETSC_USE_COMPLEX)
1078     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1079     n = N;
1080 #else
1081     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1082     n = tot_dim;
1083 #endif
1084 
1085   } else {
1086     ptrdiff_t alloc_local,local_n0,local_0_start;
1087     switch (ndim) {
1088     case 1:
1089 #if !defined(PETSC_USE_COMPLEX)
1090       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1091 #else
1092       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1093       n = (PetscInt)local_n0;
1094       n1 = (PetscInt) local_n1;
1095       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1096 #endif
1097       break;
1098     case 2:
1099 #if defined(PETSC_USE_COMPLEX)
1100       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1101       /*
1102        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1103        PetscSynchronizedFlush(comm);
1104        */
1105       n = (PetscInt)local_n0*dim[1];
1106       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1107 #else
1108       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1109       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1110       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1111 #endif
1112       break;
1113     case 3:
1114 #if defined(PETSC_USE_COMPLEX)
1115       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1116       n = (PetscInt)local_n0*dim[1]*dim[2];
1117       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1118 #else
1119       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1120       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1121       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1122 #endif
1123       break;
1124     default:
1125 #if defined(PETSC_USE_COMPLEX)
1126       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1127       n = (PetscInt)local_n0*partial_dim;
1128       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1129 #else
1130       temp = pdim[ndim-1];
1131       pdim[ndim-1] = temp/2 + 1;
1132       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1133       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1134       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1135       pdim[ndim-1] = temp;
1136       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1137 #endif
1138       break;
1139     }
1140   }
1141   ierr = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1142   ierr = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1143   fft->data = (void*)fftw;
1144 
1145   fft->n            = n;
1146   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1147   fftw->partial_dim = partial_dim;
1148   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));CHKERRQ(ierr);
1149   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1150 
1151   fftw->p_forward  = 0;
1152   fftw->p_backward = 0;
1153   fftw->p_flag     = FFTW_ESTIMATE;
1154 
1155   if (size == 1) {
1156     A->ops->mult          = MatMult_SeqFFTW;
1157     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1158   } else {
1159     A->ops->mult          = MatMult_MPIFFTW;
1160     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1161   }
1162   fft->matdestroy = MatDestroy_FFTW;
1163   A->assembled    = PETSC_TRUE;
1164   A->preallocated = PETSC_TRUE;
1165   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1166   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1167   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1168 
1169   /* get runtime options */
1170   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1171     ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1172     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1173   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1174   PetscFunctionReturn(0);
1175 }
1176 EXTERN_C_END
1177 
1178 
1179 
1180 
1181