xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision ff218e97a57ed641f3ebc93f697e38ef0f3aa217)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29 
30    Input parameter:
31    mat - the matrix
32    x   - the vector on which FDFT will be performed
33 
34    Output parameter:
35    y   - vector that stores result of FDFT
36 
37 */
38 #undef __FUNCT__
39 #define __FUNCT__ "MatMult_SeqFFTW"
40 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
41 {
42   PetscErrorCode ierr;
43   Mat_FFT        *fft  = (Mat_FFT*)A->data;
44   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
45   PetscScalar    *x_array,*y_array;
46   PetscInt       ndim=fft->ndim,*dim=fft->dim;
47 
48   PetscFunctionBegin;
49   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
50   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
51   if (!fftw->p_forward){ /* create a plan, then excute it */
52     switch (ndim){
53     case 1:
54 #if defined(PETSC_USE_COMPLEX)
55       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
56 #else
57       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
58 #endif
59       break;
60     case 2:
61 #if defined(PETSC_USE_COMPLEX)
62       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
63 #else
64       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
65 #endif
66       break;
67     case 3:
68 #if defined(PETSC_USE_COMPLEX)
69       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
70 #else
71       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
72 #endif
73       break;
74     default:
75 #if defined(PETSC_USE_COMPLEX)
76       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
77 #else
78       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
79 #endif
80       break;
81     }
82     fftw->finarray  = x_array;
83     fftw->foutarray = y_array;
84     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
85                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
86     fftw_execute(fftw->p_forward);
87   } else { /* use existing plan */
88     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
89       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
90     } else {
91       fftw_execute(fftw->p_forward);
92     }
93   }
94   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
95   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
96   PetscFunctionReturn(0);
97 }
98 
99 /* MatMultTranspose_SeqFFTW performs serial backward DFT
100 
101    Input parameter:
102    mat - the matrix
103    x   - the vector on which BDFT will be performed
104 
105    Output parameter:
106    y   - vector that stores result of BDFT
107 
108 */
109 
110 #undef __FUNCT__
111 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
112 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
113 {
114   PetscErrorCode ierr;
115   Mat_FFT        *fft = (Mat_FFT*)A->data;
116   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
117   PetscScalar    *x_array,*y_array;
118   PetscInt       ndim=fft->ndim,*dim=fft->dim;
119 
120   PetscFunctionBegin;
121   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
122   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
123   if (!fftw->p_backward){ /* create a plan, then excute it */
124     switch (ndim){
125     case 1:
126 #if defined(PETSC_USE_COMPLEX)
127       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
128 #else
129       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
130 #endif
131       break;
132     case 2:
133 #if defined(PETSC_USE_COMPLEX)
134       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
135 #else
136       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
137 #endif
138       break;
139     case 3:
140 #if defined(PETSC_USE_COMPLEX)
141       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
142 #else
143       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
144 #endif
145       break;
146     default:
147 #if defined(PETSC_USE_COMPLEX)
148       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
149 #else
150       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
151 #endif
152       break;
153     }
154     fftw->binarray  = x_array;
155     fftw->boutarray = y_array;
156     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
157   } else { /* use existing plan */
158     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
159       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
160     } else {
161       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
162     }
163   }
164   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
165   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
166   PetscFunctionReturn(0);
167 }
168 
169 /* MatMult_MPIFFTW performs forward DFT in parallel
170 
171    Input parameter:
172    mat - the matrix
173    x   - the vector on which FDFT will be performed
174 
175    Output parameter:
176    y   - vector that stores result of FDFT
177 
178 */
179 
180 #undef __FUNCT__
181 #define __FUNCT__ "MatMult_MPIFFTW"
182 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
183 {
184   PetscErrorCode ierr;
185   Mat_FFT        *fft  = (Mat_FFT*)A->data;
186   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
187   PetscScalar    *x_array,*y_array;
188   PetscInt       ndim=fft->ndim,*dim=fft->dim;
189   MPI_Comm       comm=((PetscObject)A)->comm;
190 
191   PetscFunctionBegin;
192   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
193   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
194   if (!fftw->p_forward){ /* create a plan, then excute it */
195     switch (ndim){
196     case 1:
197 #if defined(PETSC_USE_COMPLEX)
198       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
201 #endif
202       break;
203     case 2:
204 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
205       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     case 3:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     default:
218 #if defined(PETSC_USE_COMPLEX)
219       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
220 #else
221       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
222 #endif
223       break;
224     }
225     fftw->finarray  = x_array;
226     fftw->foutarray = y_array;
227     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
228                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
229     fftw_execute(fftw->p_forward);
230   } else { /* use existing plan */
231     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
232       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
233     } else {
234       fftw_execute(fftw->p_forward);
235     }
236   }
237   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
238   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
243 
244    Input parameter:
245    mat - the matrix
246    x   - the vector on which BDFT will be performed
247 
248    Output parameter:
249    y   - vector that stores result of BDFT
250 
251 */
252 #undef __FUNCT__
253 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
254 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
255 {
256   PetscErrorCode ierr;
257   Mat_FFT        *fft  = (Mat_FFT*)A->data;
258   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
259   PetscScalar    *x_array,*y_array;
260   PetscInt       ndim=fft->ndim,*dim=fft->dim;
261   MPI_Comm       comm=((PetscObject)A)->comm;
262 
263   PetscFunctionBegin;
264   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
265   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
266   if (!fftw->p_backward){ /* create a plan, then excute it */
267     switch (ndim){
268     case 1:
269 #if defined(PETSC_USE_COMPLEX)
270       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
271 #else
272       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
273 #endif
274       break;
275     case 2:
276 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
277       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
278 #else
279       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
280 #endif
281       break;
282     case 3:
283 #if defined(PETSC_USE_COMPLEX)
284       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
285 #else
286       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
287 #endif
288       break;
289     default:
290 #if defined(PETSC_USE_COMPLEX)
291       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
292 #else
293       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
294 #endif
295       break;
296     }
297     fftw->binarray  = x_array;
298     fftw->boutarray = y_array;
299     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
300   } else { /* use existing plan */
301     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
302       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
303     } else {
304       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
305     }
306   }
307   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
308   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 #undef __FUNCT__
313 #define __FUNCT__ "MatDestroy_FFTW"
314 PetscErrorCode MatDestroy_FFTW(Mat A)
315 {
316   Mat_FFT        *fft = (Mat_FFT*)A->data;
317   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
318   PetscErrorCode ierr;
319 
320   PetscFunctionBegin;
321   fftw_destroy_plan(fftw->p_forward);
322   fftw_destroy_plan(fftw->p_backward);
323   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
324   ierr = PetscFree(fft->data);CHKERRQ(ierr);
325   PetscFunctionReturn(0);
326 }
327 
328 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
329 #undef __FUNCT__
330 #define __FUNCT__ "VecDestroy_MPIFFTW"
331 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
332 {
333   PetscErrorCode  ierr;
334   PetscScalar     *array;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
338   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
339   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
340   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
341   PetscFunctionReturn(0);
342 }
343 
344 
345 #undef __FUNCT__
346 #define __FUNCT__ "MatGetVecsFFTW"
347 /*@
348    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
349      parallel layout determined by FFTW
350 
351    Collective on Mat
352 
353    Input Parameter:
354 .  mat - the matrix
355 
356    Output Parameter:
357 +   fin - (optional) input vector of forward FFTW
358 -   fout - (optional) output vector of forward FFTW
359 -   bout - (optional) output vector of backward FFTW
360 
361   Level: advanced
362   Note: The parallel layout of output of forward FFTW is always same as the input
363         of the backward FFTW. But parallel layout of the input vector of forward
364         FFTW might not be same as the output of backward FFTW.
365         Also note that we need to provide enough space while doing parallel real transform.
366         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
367         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
368         last dimension from n to n/2+1 while invoking this routine.
369 
370 .seealso: MatCreateFFTW()
371 @*/
372 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
373 {
374   PetscErrorCode ierr;
375 
376   PetscFunctionBegin;
377   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 EXTERN_C_BEGIN
382 #undef __FUNCT__
383 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
384 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
385 {
386   PetscErrorCode ierr;
387   PetscMPIInt    size,rank;
388   MPI_Comm       comm=((PetscObject)A)->comm;
389   Mat_FFT        *fft = (Mat_FFT*)A->data;
390   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
391   PetscInt       N=fft->N;
392   PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;
393 
394   PetscFunctionBegin;
395   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
396   PetscValidType(A,1);
397 
398   ierr = MPI_Comm_size(PETSC_COMM_WORLD, &size);CHKERRQ(ierr);
399   ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
400   if (size == 1){ /* sequential case */
401 #if defined(PETSC_USE_COMPLEX)
402     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
403     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
404     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
405 #else
406     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
407     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
408     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
409 #endif
410   } else { /* parallel cases */
411     ptrdiff_t      alloc_local,local_n0,local_0_start;
412     ptrdiff_t      local_n1;
413     fftw_complex   *data_fout;
414     ptrdiff_t      local_1_start;
415 #if defined(PETSC_USE_COMPLEX)
416     fftw_complex   *data_fin,*data_bout;
417 #else
418     double         *data_finr,*data_boutr;
419     PetscInt       n1,N1,vsize;
420     ptrdiff_t      temp;
421 #endif
422 
423     switch (ndim){
424           case 1:
425 #if !defined(PETSC_USE_COMPLEX)
426                  SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
427 #else
428                  alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
429                  if (fin) {
430                          data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
431                          ierr = VecCreateMPIWithArray(comm,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
432                          (*fin)->ops->destroy = VecDestroy_MPIFFTW;
433                          }
434                  if (fout) {
435                           data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
436                           ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
437                           (*fout)->ops->destroy = VecDestroy_MPIFFTW;
438                          }
439                  alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
440                  if (bout) {
441                           data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
442                           ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
443                           (*bout)->ops->destroy = VecDestroy_MPIFFTW;
444                          }
445           break;
446 #endif
447           case 2:
448 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
449                  alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
450                  N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
451                  if (fin) {
452                            data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
453                            ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
454                            (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
455                           }
456                  if (bout) {
457                            data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
458                            ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
459                            ierr = VecGetSize(*bout,&vsize);CHKERRQ(ierr);
460                            (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
461                           }
462                  //n1 = 2*local_n1*dim[0];
463                  if (fout) {
464                             data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
465                             ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
466                             (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
467                            }
468 #else
469       /* Get local size */
470                  alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
471                  if (fin) {
472                            data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
473                            ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
474                            (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
475                           }
476                  if (fout) {
477                             data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
478                             ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
479                             (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
480                            }
481                  if (bout) {
482                            data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483                            ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
484                            (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
485                           }
486 #endif
487           break;
488           case 3:
489 #if !defined(PETSC_USE_COMPLEX)
490                  alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
491                  N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
492                  if (fin) {
493                          data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
494                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
495                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
496                          }
497                  if (bout) {
498                          data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
499                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
500                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
501                           }
502                  //n1 = 2*local_n1*dim[0]*(dim[2]/2+1);
503                  if (fout) {
504                           data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
505                           ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
506                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
507                           }
508 #else
509                  alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
510                  if (fin) {
511                          data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
512                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
513                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
514                          }
515                  if (fout) {
516                           data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
517                           ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
518                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
519                          }
520                  if (bout) {
521                          data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
522                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
523                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
524                          }
525 #endif
526           break;
527           default:
528 #if !defined(PETSC_USE_COMPLEX)
529                  temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
530                  (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
531                  alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
532                  N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
533                  (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
534 
535                  if (fin) {
536                          data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
537                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
538                          (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
539                         }
540                  if (bout) {
541                          data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
542                          ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
543                          (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
544                         }
545                  //temp = fftw->partial_dim;
546                  //fftw->partial_dim = fftw->partial_dim * ((fftw->dim_fftw)[fftw->ndim_fftw-1]/2+1)*(fftw->dim_fftw)[1]/((fftw->dim_fftw)[2]*(fftw->dim_fftw)[fftw->ndim_fftw-1]);
547                  //n1 = 2*local_n1*(fftw->partial_dim);  fftw->partial_dim = temp;
548                  if (fout) {
549                           data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
550                           ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
551                           (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
552                         }
553 #else
554                 alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
555                 if (fin) {
556                        data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
557                        ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
558                        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
559                        }
560                 if (fout) {
561                          data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
562                          ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
563                          (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
564                        }
565                 if (bout) {
566                        data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
567                        ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
568                        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
569                        }
570 #endif
571             break;
572           }
573   }
574   PetscFunctionReturn(0);
575 }
576 EXTERN_C_END
577 
578 #undef __FUNCT__
579 #define __FUNCT__ "VecScatterPetscToFFTW"
580 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
581 {
582   PetscErrorCode ierr;
583   PetscFunctionBegin;
584   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
585   PetscFunctionReturn(0);
586 }
587 
588 /*
589       VecScatterPetscToFFTW_FFTW - Copies the user data to the vector that goes into FFTW block.
590   Input A, x, y
591   A - FFTW matrix
592   x - user data
593   Options Database Keys:
594 + -mat_fftw_plannerflags - set FFTW planner flags
595 
596    Level: intermediate
597    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
598          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
599          zeros. This routine does that job by scattering operation.
600 
601 
602 */
603 
604 EXTERN_C_BEGIN
605 #undef __FUNCT__
606 #define __FUNCT__ "VecScatterPetscToFFTW_FTTW"
607 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
608 {
609   PetscErrorCode ierr;
610   MPI_Comm       comm=((PetscObject)A)->comm;
611   Mat_FFT        *fft = (Mat_FFT*)A->data;
612   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
613   PetscInt       N=fft->N;
614   PetscInt       ndim=fft->ndim,*dim=fft->dim;
615   PetscInt       low;
616   PetscInt       rank,size,vsize,vsize1;
617   ptrdiff_t      alloc_local,local_n0,local_0_start;
618   ptrdiff_t      local_n1,local_1_start;
619   VecScatter     vecscat;
620   IS             list1,list2;
621 #if !defined(PETSC_USE_COMPLEX)
622   PetscInt       i,j,k,partial_dim;
623   PetscInt       *indx1, *indx2, tempindx, tempindx1;
624   PetscInt       N1, n1 ,NM;
625   ptrdiff_t      temp;
626 #endif
627 
628   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
629   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
630   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
631 
632   if (size==1)
633     {
634           ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
635           ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
636           ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
637           ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
638           ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
639           ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
640           ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
641           ierr = ISDestroy(&list1);CHKERRQ(ierr);
642     }
643 
644  else{
645 
646  switch (ndim){
647  case 1:
648 #if defined(PETSC_USE_COMPLEX)
649       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
650       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,local_0_start,1,&list1);
651       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,low,1,&list2);
652       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
653       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
654       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
655       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
656       ierr = ISDestroy(&list1);CHKERRQ(ierr);
657       ierr = ISDestroy(&list2);CHKERRQ(ierr);
658 #else
659       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
660 #endif
661  break;
662  case 2:
663 #if defined(PETSC_USE_COMPLEX)
664       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
665       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
666       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
667       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
668       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
669       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
670       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
671       ierr = ISDestroy(&list1);CHKERRQ(ierr);
672       ierr = ISDestroy(&list2);CHKERRQ(ierr);
673 #else
674       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
675       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
676       //ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx1);CHKERRQ(ierr);
677       //ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx2);CHKERRQ(ierr);
678       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
679       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
680 
681       if (dim[1]%2==0)
682         NM = dim[1]+2;
683       else
684         NM = dim[1]+1;
685 
686       for (i=0;i<local_n0;i++){
687          for (j=0;j<dim[1];j++){
688             tempindx = i*dim[1] + j;
689             tempindx1 = i*NM + j;
690             indx1[tempindx]=local_0_start*dim[1]+tempindx;
691             indx2[tempindx]=low+tempindx1;
692         }
693      }
694 
695       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
696       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
697 
698       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
699       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
700       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
701       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
702       ierr = ISDestroy(&list1);CHKERRQ(ierr);
703       ierr = ISDestroy(&list2);CHKERRQ(ierr);
704       ierr = PetscFree(indx1);CHKERRQ(ierr);
705       ierr = PetscFree(indx2);CHKERRQ(ierr);
706 #endif
707  break;
708 
709  case 3:
710 #if defined(PETSC_USE_COMPLEX)
711       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
712       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
713       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
714       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
715       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
716       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
717       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
718       ierr = ISDestroy(&list1);CHKERRQ(ierr);
719       ierr = ISDestroy(&list2);CHKERRQ(ierr);
720 #else
721       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
722       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
723 
724       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
725       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
726 
727       if (dim[2]%2==0)
728         NM = dim[2]+2;
729       else
730         NM = dim[2]+1;
731 
732       for (i=0;i<local_n0;i++){
733          for (j=0;j<dim[1];j++){
734             for (k=0;k<dim[2];k++){
735                tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
736                tempindx1 = i*dim[1]*NM + j*NM + k;
737                indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
738                indx2[tempindx]=low+tempindx1;
739             }
740          }
741       }
742 
743       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
744       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
745       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
746       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
747       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
748       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
749       ierr = ISDestroy(&list1);CHKERRQ(ierr);
750       ierr = ISDestroy(&list2);CHKERRQ(ierr);
751       ierr = PetscFree(indx1);CHKERRQ(ierr);
752       ierr = PetscFree(indx2);CHKERRQ(ierr);
753 #endif
754  break;
755 
756  default:
757 #if defined(PETSC_USE_COMPLEX)
758       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
759       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
760       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
761       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
762       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
763       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
764       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
765       ierr = ISDestroy(&list1);CHKERRQ(ierr);
766       ierr = ISDestroy(&list2);CHKERRQ(ierr);
767 #else
768       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
769       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
770       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
771       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
772       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
773 
774       partial_dim = fftw->partial_dim;
775 
776       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
777       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
778 
779       if (dim[ndim-1]%2==0)
780         NM = 2;
781       else
782         NM = 1;
783 
784       j = low;
785       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++)
786          {
787           indx1[i] = local_0_start*partial_dim + i;
788           indx2[i] = j;
789           if (k%dim[ndim-1]==0)
790             { j+=NM;}
791           j++;
792          }
793       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
794       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
795       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
796       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
797       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
798       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
799       ierr = ISDestroy(&list1);CHKERRQ(ierr);
800       ierr = ISDestroy(&list2);CHKERRQ(ierr);
801       ierr = PetscFree(indx1);CHKERRQ(ierr);
802       ierr = PetscFree(indx2);CHKERRQ(ierr);
803 #endif
804  break;
805   }
806  }
807  return(0);
808 }
809 EXTERN_C_END
810 
811 #undef __FUNCT__
812 #define __FUNCT__ "VecScatterFFTWToPetsc"
813 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
814 {
815   PetscErrorCode ierr;
816   PetscFunctionBegin;
817   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
818   PetscFunctionReturn(0);
819 }
820 
821 /*
822       VecScatterFFTWToPetsc_FFTW - Converts FFTW output to the PETSc vector that user can use.
823   Input A, x, y
824   A - FFTW matrix
825   x - FFTW vector
826   y - PETSc vector that user can read
827 
828    Level: intermediate
829    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
830          VecScatterFFTWToPetsc removes those extra zeros.
831 
832 */
833 
834 EXTERN_C_BEGIN
835 #undef __FUNCT__
836 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
837 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
838 {
839   PetscErrorCode ierr;
840   MPI_Comm       comm=((PetscObject)A)->comm;
841   Mat_FFT        *fft = (Mat_FFT*)A->data;
842   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
843   PetscInt       N=fft->N;
844   PetscInt       ndim=fft->ndim,*dim=fft->dim;
845   PetscInt       low;
846   PetscInt       rank,size;
847   ptrdiff_t      alloc_local,local_n0,local_0_start;
848   ptrdiff_t      local_n1,local_1_start;
849   VecScatter     vecscat;
850   IS             list1,list2;
851 #if !defined(PETSC_USE_COMPLEX)
852   PetscInt       i,j,k,partial_dim;
853   PetscInt       *indx1, *indx2, tempindx, tempindx1;
854   PetscInt       N1, n1 ,NM;
855   ptrdiff_t      temp;
856 #endif
857 
858 
859   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
860   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
861   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
862 
863   if (size==1){
864     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
865     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
866     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
867     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
868     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
869     ierr = ISDestroy(&list1);CHKERRQ(ierr);
870   }
871   else{
872 
873   switch (ndim){
874   case 1:
875 #if defined(PETSC_USE_COMPLEX)
876       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
877       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,local_1_start,1,&list1);
878       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,low,1,&list2);
879       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
880       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
881       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
882       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
883       ierr = ISDestroy(&list1);CHKERRQ(ierr);
884       ierr = ISDestroy(&list2);CHKERRQ(ierr);
885 #else
886       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
887 #endif
888   break;
889   case 2:
890 #if defined(PETSC_USE_COMPLEX)
891       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
892       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
893       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
894       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
895       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
896       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
897       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
898       ierr = ISDestroy(&list1);CHKERRQ(ierr);
899       ierr = ISDestroy(&list2);CHKERRQ(ierr);
900 #else
901       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
902       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
903 
904       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
905       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
906 
907       if (dim[1]%2==0)
908         NM = dim[1]+2;
909       else
910         NM = dim[1]+1;
911 
912       for (i=0;i<local_n0;i++){
913          for (j=0;j<dim[1];j++){
914             tempindx = i*dim[1] + j;
915             tempindx1 = i*NM + j;
916             indx1[tempindx]=local_0_start*dim[1]+tempindx;
917             indx2[tempindx]=low+tempindx1;
918          }
919        }
920 
921        ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
922        ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
923 
924        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
925        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
926        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
927        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
928        ierr = ISDestroy(&list1);CHKERRQ(ierr);
929        ierr = ISDestroy(&list2);CHKERRQ(ierr);
930        ierr = PetscFree(indx1);CHKERRQ(ierr);
931        ierr = PetscFree(indx2);CHKERRQ(ierr);
932 #endif
933   break;
934   case 3:
935 #if defined(PETSC_USE_COMPLEX)
936        alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
937        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
938        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
939        ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
940        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
941        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
942        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
943        ierr = ISDestroy(&list1);CHKERRQ(ierr);
944        ierr = ISDestroy(&list2);CHKERRQ(ierr);
945 #else
946 
947        alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
948        N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
949 
950 //     ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx1);CHKERRQ(ierr);
951 //     ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*N1,&indx2);CHKERRQ(ierr);
952        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
953        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
954 
955        if (dim[2]%2==0)
956         NM = dim[2]+2;
957        else
958         NM = dim[2]+1;
959 
960        for (i=0;i<local_n0;i++){
961           for (j=0;j<dim[1];j++){
962              for (k=0;k<dim[2];k++){
963                 tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
964                 tempindx1 = i*dim[1]*NM + j*NM + k;
965                 indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
966                 indx2[tempindx]=low+tempindx1;
967              }
968           }
969        }
970 
971        ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
972        ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
973 
974        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
975        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
976        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
977        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
978        ierr = ISDestroy(&list1);CHKERRQ(ierr);
979        ierr = ISDestroy(&list2);CHKERRQ(ierr);
980        ierr = PetscFree(indx1);CHKERRQ(ierr);
981        ierr = PetscFree(indx2);CHKERRQ(ierr);
982 #endif
983   break;
984   default:
985 #if defined(PETSC_USE_COMPLEX)
986        alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
987        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
988        ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
989        ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
990        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
991        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
992        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
993        ierr = ISDestroy(&list1);CHKERRQ(ierr);
994        ierr = ISDestroy(&list2);CHKERRQ(ierr);
995 #else
996        temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
997        (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
998        alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
999        N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1000        (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1001 
1002        partial_dim = fftw->partial_dim;
1003 
1004        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1005        ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1006 
1007        if (dim[ndim-1]%2==0)
1008          NM = 2;
1009        else
1010          NM = 1;
1011 
1012        j = low;
1013        for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++)
1014           {
1015            indx1[i] = local_0_start*partial_dim + i;
1016            indx2[i] = j;
1017            if (k%dim[ndim-1]==0)
1018              { j+=NM;}
1019            j++;
1020           }
1021        ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1022        ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1023 
1024        ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1025        ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1026        ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1027        ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1028        ierr = ISDestroy(&list1);CHKERRQ(ierr);
1029        ierr = ISDestroy(&list2);CHKERRQ(ierr);
1030        ierr = PetscFree(indx1);CHKERRQ(ierr);
1031        ierr = PetscFree(indx2);CHKERRQ(ierr);
1032 #endif
1033   break;
1034   }
1035   }
1036   return 0;
1037 }
1038 EXTERN_C_END
1039 
1040 EXTERN_C_BEGIN
1041 #undef __FUNCT__
1042 #define __FUNCT__ "MatCreate_FFTW"
1043 /*
1044       MatCreate_FFTW - Creates a matrix object that provides FFT
1045   via the external package FFTW
1046   Options Database Keys:
1047 + -mat_fftw_plannerflags - set FFTW planner flags
1048 
1049    Level: intermediate
1050 
1051 */
1052 
1053 PetscErrorCode MatCreate_FFTW(Mat A)
1054 {
1055   PetscErrorCode ierr;
1056   MPI_Comm       comm=((PetscObject)A)->comm;
1057   Mat_FFT        *fft=(Mat_FFT*)A->data;
1058   Mat_FFTW       *fftw;
1059   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1060   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1061   PetscBool      flg;
1062   PetscInt       p_flag,partial_dim=1,ctr;
1063   PetscMPIInt    size,rank;
1064   ptrdiff_t      *pdim;
1065   ptrdiff_t      local_n1,local_1_start;
1066 #if !defined(PETSC_USE_COMPLEX)
1067    ptrdiff_t     temp;
1068    PetscInt      N1,tot_dim;
1069 #else
1070    PetscInt n1;
1071 #endif
1072 
1073   PetscFunctionBegin;
1074   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1075   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1076 
1077   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1078   pdim[0] = dim[0];
1079 #if !defined(PETSC_USE_COMPLEX)
1080   tot_dim = 2*dim[0];
1081 #endif
1082   for (ctr=1;ctr<ndim;ctr++)
1083      {
1084           partial_dim *= dim[ctr];
1085           pdim[ctr] = dim[ctr];
1086 #if !defined(PETSC_USE_COMPLEX)
1087           if (ctr==ndim-1)
1088             tot_dim *= (dim[ctr]/2+1);
1089           else
1090             tot_dim *= dim[ctr];
1091 #endif
1092      }
1093 
1094   if (size == 1) {
1095 #if defined(PETSC_USE_COMPLEX)
1096     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1097     n = N;
1098 #else
1099     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1100     n = tot_dim;
1101 #endif
1102 
1103   } else {
1104     ptrdiff_t alloc_local,local_n0,local_0_start;
1105     switch (ndim){
1106     case 1:
1107 #if !defined(PETSC_USE_COMPLEX)
1108    SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1109 #else
1110       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1111       n = (PetscInt)local_n0;
1112       n1 = (PetscInt) local_n1;
1113       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1114 #endif
1115       break;
1116     case 2:
1117 #if defined(PETSC_USE_COMPLEX)
1118       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1119       /*
1120        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1121        PetscSynchronizedFlush(comm);
1122        */
1123       n = (PetscInt)local_n0*dim[1];
1124       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1125 #else
1126       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1127       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1128       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1129 #endif
1130       break;
1131     case 3:
1132 #if defined(PETSC_USE_COMPLEX)
1133       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1134       n = (PetscInt)local_n0*dim[1]*dim[2];
1135       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1136 #else
1137       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1138       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1139       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1140 #endif
1141       break;
1142     default:
1143 #if defined(PETSC_USE_COMPLEX)
1144       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1145       n = (PetscInt)local_n0*partial_dim;
1146       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1147 #else
1148       temp = pdim[ndim-1];
1149       pdim[ndim-1] = temp/2 + 1;
1150       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1151       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1152       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1153       pdim[ndim-1] = temp;
1154       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1155 #endif
1156       break;
1157     }
1158   }
1159   ierr = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1160   ierr = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1161   fft->data = (void*)fftw;
1162 
1163   fft->n           = n;
1164   fftw->ndim_fftw  = (ptrdiff_t)ndim; // This is dimension of fft
1165   fftw->partial_dim = partial_dim;
1166   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));CHKERRQ(ierr);
1167   for(ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1168 
1169   fftw->p_forward  = 0;
1170   fftw->p_backward = 0;
1171   fftw->p_flag     = FFTW_ESTIMATE;
1172 
1173   if (size == 1){
1174     A->ops->mult          = MatMult_SeqFFTW;
1175     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1176   } else {
1177     A->ops->mult          = MatMult_MPIFFTW;
1178     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1179   }
1180   fft->matdestroy          = MatDestroy_FFTW;
1181   A->assembled          = PETSC_TRUE;
1182   PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);
1183   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);
1184   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);
1185 
1186   /* get runtime options */
1187   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1188     ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1189     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1190   PetscOptionsEnd();
1191   PetscFunctionReturn(0);
1192 }
1193 EXTERN_C_END
1194 
1195 
1196 
1197 
1198