xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision bef158480efac06de457f7a665168877ab3c2fd7)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
/* Private data for MATFFTW matrices; hangs off Mat_FFT->data */
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;   /* number of dimensions and dimension sizes in FFTW's native ptrdiff_t form,
                                         as required by the guru and MPI planner interfaces */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;               /* per-dimension size/stride descriptors for the 64-bit guru planner */
#else
  fftw_iodim   *iodims;               /* per-dimension size/stride descriptors for the guru planner */
#endif
  PetscInt     partial_dim;           /* set outside this chunk; presumably a partial product of the dims — TODO confirm */
  fftw_plan    p_forward,p_backward;  /* cached plans; created lazily on first MatMult()/MatMultTranspose() */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
25 
26 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
27 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
28 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
29 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
30 extern PetscErrorCode MatDestroy_FFTW(Mat);
31 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
32 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
33 
/* MatMult_SeqFFTW performs serial forward DFT
35    Input parameter:
36      A - the matrix
37      x - the vector on which FDFT will be performed
38 
39    Output parameter:
40      y - vector that stores result of FDFT
41 */
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  PetscErrorCode ierr;
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
  ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
  if (!fftw->p_forward) { /* no cached plan yet: create one for these arrays, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      /* real scalars: real-to-complex (r2c) transform */
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* dimensions > 3: use the guru interface with explicit row-major strides
         (last dimension contiguous, earlier strides are products of later sizes) */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember the arrays the plan was created for; see Mat_FFTW */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
135 
136 /* MatMultTranspose_SeqFFTW performs serial backward DFT
137    Input parameter:
138      A - the matrix
139      x - the vector on which BDFT will be performed
140 
141    Output parameter:
142      y - vector that stores result of BDFT
143 */
144 
145 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
146 {
147   PetscErrorCode ierr;
148   Mat_FFT        *fft  = (Mat_FFT*)A->data;
149   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
150   const PetscScalar *x_array;
151   PetscScalar    *y_array;
152   PetscInt       ndim=fft->ndim,*dim=fft->dim;
153 #if defined(PETSC_USE_COMPLEX)
154 #if defined(PETSC_USE_64BIT_INDICES)
155   fftw_iodim64   *iodims=fftw->iodims;
156 #else
157   fftw_iodim     *iodims=fftw->iodims;
158 #endif
159 #endif
160 
161   PetscFunctionBegin;
162   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
163   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
164   if (!fftw->p_backward) { /* create a plan, then excute it */
165     switch (ndim) {
166     case 1:
167 #if defined(PETSC_USE_COMPLEX)
168       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
169 #else
170       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
171 #endif
172       break;
173     case 2:
174 #if defined(PETSC_USE_COMPLEX)
175       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
176 #else
177       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
178 #endif
179       break;
180     case 3:
181 #if defined(PETSC_USE_COMPLEX)
182       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
183 #else
184       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
185 #endif
186       break;
187     default:
188 #if defined(PETSC_USE_COMPLEX)
189 #if defined(PETSC_USE_64BIT_INDICES)
190       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
191 #else
192       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
193 #endif
194 #else
195       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
196 #endif
197       break;
198     }
199     fftw->binarray  = (PetscScalar *) x_array;
200     fftw->boutarray = y_array;
201     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
202   } else { /* use existing plan */
203     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
204 #if defined(PETSC_USE_COMPLEX)
205       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
206 #else
207       fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
208 #endif
209     } else {
210       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
211     }
212   }
213   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
214   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 /* MatMult_MPIFFTW performs forward DFT in parallel
219    Input parameter:
220      A - the matrix
221      x - the vector on which FDFT will be performed
222 
223    Output parameter:
224    y   - vector that stores result of FDFT
225 */
226 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
227 {
228   PetscErrorCode ierr;
229   Mat_FFT        *fft  = (Mat_FFT*)A->data;
230   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
231   const PetscScalar *x_array;
232   PetscScalar    *y_array;
233   PetscInt       ndim=fft->ndim,*dim=fft->dim;
234   MPI_Comm       comm;
235 
236   PetscFunctionBegin;
237   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
238   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
239   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
240   if (!fftw->p_forward) { /* create a plan, then excute it */
241     switch (ndim) {
242     case 1:
243 #if defined(PETSC_USE_COMPLEX)
244       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
245 #else
246       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
247 #endif
248       break;
249     case 2:
250 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
251       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
252 #else
253       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
254 #endif
255       break;
256     case 3:
257 #if defined(PETSC_USE_COMPLEX)
258       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
259 #else
260       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
261 #endif
262       break;
263     default:
264 #if defined(PETSC_USE_COMPLEX)
265       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
266 #else
267       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
268 #endif
269       break;
270     }
271     fftw->finarray  = (PetscScalar *) x_array;
272     fftw->foutarray = y_array;
273     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
274                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
275     fftw_execute(fftw->p_forward);
276   } else { /* use existing plan */
277     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
278       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
279     } else {
280       fftw_execute(fftw->p_forward);
281     }
282   }
283   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
284   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
285   PetscFunctionReturn(0);
286 }
287 
288 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
289    Input parameter:
290      A - the matrix
291      x - the vector on which BDFT will be performed
292 
293    Output parameter:
294      y - vector that stores result of BDFT
295 */
296 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
297 {
298   PetscErrorCode ierr;
299   Mat_FFT        *fft  = (Mat_FFT*)A->data;
300   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
301   const PetscScalar *x_array;
302   PetscScalar    *y_array;
303   PetscInt       ndim=fft->ndim,*dim=fft->dim;
304   MPI_Comm       comm;
305 
306   PetscFunctionBegin;
307   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
308   ierr = VecGetArrayRead(x,&x_array);CHKERRQ(ierr);
309   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
310   if (!fftw->p_backward) { /* create a plan, then excute it */
311     switch (ndim) {
312     case 1:
313 #if defined(PETSC_USE_COMPLEX)
314       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
315 #else
316       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
317 #endif
318       break;
319     case 2:
320 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
321       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
322 #else
323       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
324 #endif
325       break;
326     case 3:
327 #if defined(PETSC_USE_COMPLEX)
328       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
329 #else
330       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
331 #endif
332       break;
333     default:
334 #if defined(PETSC_USE_COMPLEX)
335       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
336 #else
337       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
338 #endif
339       break;
340     }
341     fftw->binarray  = (PetscScalar *) x_array;
342     fftw->boutarray = y_array;
343     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
344   } else { /* use existing plan */
345     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
346       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
347     } else {
348       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
349     }
350   }
351   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
352   ierr = VecRestoreArrayRead(x,&x_array);CHKERRQ(ierr);
353   PetscFunctionReturn(0);
354 }
355 
356 PetscErrorCode MatDestroy_FFTW(Mat A)
357 {
358   Mat_FFT        *fft  = (Mat_FFT*)A->data;
359   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
360   PetscErrorCode ierr;
361 
362   PetscFunctionBegin;
363   fftw_destroy_plan(fftw->p_forward);
364   fftw_destroy_plan(fftw->p_backward);
365   if (fftw->iodims) {
366     free(fftw->iodims);
367   }
368   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
369   ierr = PetscFree(fft->data);CHKERRQ(ierr);
370   fftw_mpi_cleanup();
371   PetscFunctionReturn(0);
372 }
373 
374 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
375 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
376 {
377   PetscErrorCode ierr;
378   PetscScalar    *array;
379 
380   PetscFunctionBegin;
381   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
382   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
383   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
384   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
385   PetscFunctionReturn(0);
386 }
387 
388 
389 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
390 {
391   PetscErrorCode ierr;
392   Mat            A;
393 
394   PetscFunctionBegin;
395   ierr = PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
396   ierr = MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
401 {
402   PetscErrorCode ierr;
403   Mat            A;
404 
405   PetscFunctionBegin;
406   ierr = PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
407   ierr = MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);CHKERRQ(ierr);
408   PetscFunctionReturn(0);
409 }
410 
411 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
412 {
413   PetscErrorCode ierr;
414   Mat            A;
415 
416   PetscFunctionBegin;
417   ierr = PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);CHKERRQ(ierr);
418   ierr = MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);CHKERRQ(ierr);
419   PetscFunctionReturn(0);
420 }
421 
422 
423 /*@
424    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
425      parallel layout determined by FFTW
426 
427    Collective on Mat
428 
429    Input Parameter:
430 .   A - the matrix
431 
   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW
436 
437   Level: advanced
438 
439   Note: The parallel layout of output of forward FFTW is always same as the input
440         of the backward FFTW. But parallel layout of the input vector of forward
441         FFTW might not be same as the output of backward FFTW.
        Also note that we need to provide enough space while doing parallel real transform.
        We need to pad extra zeros at the end of the last dimension. For this reason one needs to
        invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd: if the last dimension is even one needs to
        pad two zeros; if it is odd only one zero is needed.
448         Lastly one needs some scratch space at the end of data set in each process. alloc_local
449         figures out how much space is needed, i.e. it figures out the data+scratch space for
450         each processor and returns that.
451 
452 .seealso: MatCreateFFT()
453 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* dispatch to the implementation composed on the matrix (MatCreateVecsFFTW_FFTW);
     errors if the matrix does not provide "MatCreateVecsFFTW_C" */
  ierr = PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
462 
463 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
464 {
465   PetscErrorCode ierr;
466   PetscMPIInt    size,rank;
467   MPI_Comm       comm;
468   Mat_FFT        *fft  = (Mat_FFT*)A->data;
469   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
470   PetscInt       N     = fft->N;
471   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
472 
473   PetscFunctionBegin;
474   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
475   PetscValidType(A,1);
476   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
477 
478   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
479   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
480   if (size == 1) { /* sequential case */
481 #if defined(PETSC_USE_COMPLEX)
482     if (fin)  {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
483     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
484     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
485 #else
486     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
487     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
488     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
489 #endif
490   } else { /* parallel cases */
491     ptrdiff_t    alloc_local,local_n0,local_0_start;
492     ptrdiff_t    local_n1;
493     fftw_complex *data_fout;
494     ptrdiff_t    local_1_start;
495 #if defined(PETSC_USE_COMPLEX)
496     fftw_complex *data_fin,*data_bout;
497 #else
498     double    *data_finr,*data_boutr;
499     PetscInt  n1,N1;
500     ptrdiff_t temp;
501 #endif
502 
503     switch (ndim) {
504     case 1:
505 #if !defined(PETSC_USE_COMPLEX)
506       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
507 #else
508       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
509       if (fin) {
510         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
511         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
512         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
513         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
514         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
515       }
516       if (fout) {
517         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
518         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
519         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
520         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
521         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
522       }
523       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
524       if (bout) {
525         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
526         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
527         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
528         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
529         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
530       }
531       break;
532 #endif
533     case 2:
534 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
535       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
536       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
537       if (fin) {
538         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
539         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
540         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
541         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
542         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
543       }
544       if (fout) {
545         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
546         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
547         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
548         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
549         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
550       }
551       if (bout) {
552         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
554         ierr       = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
555         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
556         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
557       }
558 #else
559       /* Get local size */
560       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
561       if (fin) {
562         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
563         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
564         ierr     = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
565         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
566         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
567       }
568       if (fout) {
569         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
570         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
571         ierr      = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
572         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
573         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
574       }
575       if (bout) {
576         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
577         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
578         ierr      = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
579         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
580         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
581       }
582 #endif
583       break;
584     case 3:
585 #if !defined(PETSC_USE_COMPLEX)
586       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
587       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
588       if (fin) {
589         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
590         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
591         ierr      = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
592         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
593         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
594       }
595       if (fout) {
596         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
597         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
598         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
599         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
600         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
601       }
602       if (bout) {
603         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
604         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
605         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
606         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
607         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
608       }
609 #else
610       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
611       if (fin) {
612         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
613         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
614         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
615         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
616         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
617       }
618       if (fout) {
619         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
620         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
621         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
622         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
623         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
624       }
625       if (bout) {
626         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
627         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
628         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
629         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
630         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
631       }
632 #endif
633       break;
634     default:
635 #if !defined(PETSC_USE_COMPLEX)
636       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
637 
638       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
639 
640       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
641       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
642 
643       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
644 
645       if (fin) {
646         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
647         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
648         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
649         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
650         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
651       }
652       if (fout) {
653         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
654         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
655         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
656         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
657         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
658       }
659       if (bout) {
660         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
661         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
662         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
663         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
664         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
665       }
666 #else
667       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
668       if (fin) {
669         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
670         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
671         ierr = PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
672         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
673         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
674       }
675       if (fout) {
676         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
677         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
678         ierr = PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
679         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
680         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
681       }
682       if (bout) {
683         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
684         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
685         ierr = PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);CHKERRQ(ierr);
686         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
687         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
688       }
689 #endif
690       break;
691     }
    /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
       v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental use.
     */
696     if (fin)  (*fin)->ops->replacearray = NULL;
697     if (fout) (*fout)->ops->replacearray = NULL;
698     if (bout) (*bout)->ops->replacearray = NULL;
699   }
700   PetscFunctionReturn(0);
701 }
702 
703 /*@
704    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
705 
706    Collective on Mat
707 
708    Input Parameters:
709 +  A - FFTW matrix
710 -  x - the PETSc vector
711 
712    Output Parameters:
713 .  y - the FFTW vector
714 
715   Options Database Keys:
716 . -mat_fftw_plannerflags - set FFTW planner flags
717 
718    Level: intermediate
719 
   Note: For a real parallel FFT, FFTW requires the insertion of extra space at the end of the last dimension. This is required
         even when one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to
         accommodate these extra zeros. This routine performs that job via a scatter operation.
723 
724 .seealso: VecScatterFFTWToPetsc()
725 @*/
726 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
727 {
728   PetscErrorCode ierr;
729 
730   PetscFunctionBegin;
731   ierr = PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
732   PetscFunctionReturn(0);
733 }
734 
735 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
736 {
737   PetscErrorCode ierr;
738   MPI_Comm       comm;
739   Mat_FFT        *fft  = (Mat_FFT*)A->data;
740   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
741   PetscInt       N     =fft->N;
742   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
743   PetscInt       low;
744   PetscMPIInt    rank,size;
745   PetscInt       vsize,vsize1;
746   ptrdiff_t      local_n0,local_0_start;
747   ptrdiff_t      local_n1,local_1_start;
748   VecScatter     vecscat;
749   IS             list1,list2;
750 #if !defined(PETSC_USE_COMPLEX)
751   PetscInt       i,j,k,partial_dim;
752   PetscInt       *indx1, *indx2, tempindx, tempindx1;
753   PetscInt       NM;
754   ptrdiff_t      temp;
755 #endif
756 
757   PetscFunctionBegin;
758   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
759   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
760   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
761   ierr = VecGetOwnershipRange(y,&low,NULL);
762 
763   if (size==1) {
764     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
765     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
766     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
767     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
768     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
769     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
770     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
771     ierr = ISDestroy(&list1);CHKERRQ(ierr);
772   } else {
773     switch (ndim) {
774     case 1:
775 #if defined(PETSC_USE_COMPLEX)
776       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
777 
778       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
779       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
780       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
781       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
782       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
783       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
784       ierr = ISDestroy(&list1);CHKERRQ(ierr);
785       ierr = ISDestroy(&list2);CHKERRQ(ierr);
786 #else
787       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
788 #endif
789       break;
790     case 2:
791 #if defined(PETSC_USE_COMPLEX)
792       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
793 
794       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
795       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
796       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
797       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
798       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
799       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
800       ierr = ISDestroy(&list1);CHKERRQ(ierr);
801       ierr = ISDestroy(&list2);CHKERRQ(ierr);
802 #else
803       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
804 
805       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
806       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
807 
808       if (dim[1]%2==0) {
809         NM = dim[1]+2;
810       } else {
811         NM = dim[1]+1;
812       }
813       for (i=0; i<local_n0; i++) {
814         for (j=0; j<dim[1]; j++) {
815           tempindx  = i*dim[1] + j;
816           tempindx1 = i*NM + j;
817 
818           indx1[tempindx]=local_0_start*dim[1]+tempindx;
819           indx2[tempindx]=low+tempindx1;
820         }
821       }
822 
823       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
824       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
825 
826       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
827       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
828       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
829       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
830       ierr = ISDestroy(&list1);CHKERRQ(ierr);
831       ierr = ISDestroy(&list2);CHKERRQ(ierr);
832       ierr = PetscFree(indx1);CHKERRQ(ierr);
833       ierr = PetscFree(indx2);CHKERRQ(ierr);
834 #endif
835       break;
836 
837     case 3:
838 #if defined(PETSC_USE_COMPLEX)
839       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
840 
841       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
842       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
843       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
844       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
845       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
846       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
847       ierr = ISDestroy(&list1);CHKERRQ(ierr);
848       ierr = ISDestroy(&list2);CHKERRQ(ierr);
849 #else
850       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
851       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
852       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
853 
854       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
855       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
856 
857       if (dim[2]%2==0) NM = dim[2]+2;
858       else             NM = dim[2]+1;
859 
860       for (i=0; i<local_n0; i++) {
861         for (j=0; j<dim[1]; j++) {
862           for (k=0; k<dim[2]; k++) {
863             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
864             tempindx1 = i*dim[1]*NM + j*NM + k;
865 
866             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
867             indx2[tempindx]=low+tempindx1;
868           }
869         }
870       }
871 
872       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
873       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
874       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
875       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
876       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
877       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
878       ierr = ISDestroy(&list1);CHKERRQ(ierr);
879       ierr = ISDestroy(&list2);CHKERRQ(ierr);
880       ierr = PetscFree(indx1);CHKERRQ(ierr);
881       ierr = PetscFree(indx2);CHKERRQ(ierr);
882 #endif
883       break;
884 
885     default:
886 #if defined(PETSC_USE_COMPLEX)
887       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
888 
889       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
890       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
891       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
892       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
893       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
894       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
895       ierr = ISDestroy(&list1);CHKERRQ(ierr);
896       ierr = ISDestroy(&list2);CHKERRQ(ierr);
897 #else
898       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
899       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
900       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
901 
902       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
903 
904       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
905 
906       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
907 
908       partial_dim = fftw->partial_dim;
909 
910       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
911       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
912 
913       if (dim[ndim-1]%2==0) NM = 2;
914       else                  NM = 1;
915 
916       j = low;
917       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
918         indx1[i] = local_0_start*partial_dim + i;
919         indx2[i] = j;
920         if (k%dim[ndim-1]==0) j+=NM;
921         j++;
922       }
923       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
924       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
925       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
926       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
927       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
928       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
929       ierr = ISDestroy(&list1);CHKERRQ(ierr);
930       ierr = ISDestroy(&list2);CHKERRQ(ierr);
931       ierr = PetscFree(indx1);CHKERRQ(ierr);
932       ierr = PetscFree(indx2);CHKERRQ(ierr);
933 #endif
934       break;
935     }
936   }
937   PetscFunctionReturn(0);
938 }
939 
940 /*@
941    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
942 
943    Collective on Mat
944 
945     Input Parameters:
946 +   A - FFTW matrix
947 -   x - FFTW vector
948 
949    Output Parameters:
950 .  y - PETSc vector
951 
952    Level: intermediate
953 
954    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
955          VecScatterFFTWToPetsc removes those extra zeros.
956 
957 .seealso: VecScatterPetscToFFTW()
958 @*/
959 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
960 {
961   PetscErrorCode ierr;
962 
963   PetscFunctionBegin;
964   ierr = PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
965   PetscFunctionReturn(0);
966 }
967 
968 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
969 {
970   PetscErrorCode ierr;
971   MPI_Comm       comm;
972   Mat_FFT        *fft  = (Mat_FFT*)A->data;
973   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
974   PetscInt       N     = fft->N;
975   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
976   PetscInt       low;
977   PetscMPIInt    rank,size;
978   ptrdiff_t      local_n0,local_0_start;
979   ptrdiff_t      local_n1,local_1_start;
980   VecScatter     vecscat;
981   IS             list1,list2;
982 #if !defined(PETSC_USE_COMPLEX)
983   PetscInt       i,j,k,partial_dim;
984   PetscInt       *indx1, *indx2, tempindx, tempindx1;
985   PetscInt       NM;
986   ptrdiff_t      temp;
987 #endif
988 
989   PetscFunctionBegin;
990   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
991   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
992   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
993   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
994 
995   if (size==1) {
996     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
997     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
998     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
999     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1000     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1001     ierr = ISDestroy(&list1);CHKERRQ(ierr);
1002 
1003   } else {
1004     switch (ndim) {
1005     case 1:
1006 #if defined(PETSC_USE_COMPLEX)
1007       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1008 
1009       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
1010       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
1011       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1012       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1013       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1014       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1015       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1016       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1017 #else
1018       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1019 #endif
1020       break;
1021     case 2:
1022 #if defined(PETSC_USE_COMPLEX)
1023       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1024 
1025       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
1026       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
1027       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1028       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1029       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1030       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1031       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1032       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1033 #else
1034       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1035 
1036       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
1037       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
1038 
1039       if (dim[1]%2==0) NM = dim[1]+2;
1040       else             NM = dim[1]+1;
1041 
1042       for (i=0; i<local_n0; i++) {
1043         for (j=0; j<dim[1]; j++) {
1044           tempindx = i*dim[1] + j;
1045           tempindx1 = i*NM + j;
1046 
1047           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1048           indx2[tempindx]=low+tempindx1;
1049         }
1050       }
1051 
1052       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1053       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1054 
1055       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1056       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1057       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1058       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1059       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1060       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1061       ierr = PetscFree(indx1);CHKERRQ(ierr);
1062       ierr = PetscFree(indx2);CHKERRQ(ierr);
1063 #endif
1064       break;
1065     case 3:
1066 #if defined(PETSC_USE_COMPLEX)
1067       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1068 
1069       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1070       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1071       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1072       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1073       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1074       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1075       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1076       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1077 #else
1078       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1079 
1080       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
1081       ierr = PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
1082 
1083       if (dim[2]%2==0) NM = dim[2]+2;
1084       else             NM = dim[2]+1;
1085 
1086       for (i=0; i<local_n0; i++) {
1087         for (j=0; j<dim[1]; j++) {
1088           for (k=0; k<dim[2]; k++) {
1089             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1090             tempindx1 = i*dim[1]*NM + j*NM + k;
1091 
1092             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1093             indx2[tempindx]=low+tempindx1;
1094           }
1095         }
1096       }
1097 
1098       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1099       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1100 
1101       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1102       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1103       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1104       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1105       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1106       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1107       ierr = PetscFree(indx1);CHKERRQ(ierr);
1108       ierr = PetscFree(indx2);CHKERRQ(ierr);
1109 #endif
1110       break;
1111     default:
1112 #if defined(PETSC_USE_COMPLEX)
1113       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1114 
1115       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1116       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1117       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1118       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1119       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1120       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1121       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1122       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1123 #else
1124       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1125 
1126       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1127 
1128       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1129 
1130       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1131 
1132       partial_dim = fftw->partial_dim;
1133 
1134       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1135       ierr = PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1136 
1137       if (dim[ndim-1]%2==0) NM = 2;
1138       else                  NM = 1;
1139 
1140       j = low;
1141       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1142         indx1[i] = local_0_start*partial_dim + i;
1143         indx2[i] = j;
1144         if (k%dim[ndim-1]==0) j+=NM;
1145         j++;
1146       }
1147       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1148       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1149 
1150       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1151       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1152       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1153       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1154       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1155       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1156       ierr = PetscFree(indx1);CHKERRQ(ierr);
1157       ierr = PetscFree(indx2);CHKERRQ(ierr);
1158 #endif
1159       break;
1160     }
1161   }
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 /*
1166     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1167 
  Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1170 
1171    Level: intermediate
1172 
1173 */
1174 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1175 {
1176   PetscErrorCode ierr;
1177   MPI_Comm       comm;
1178   Mat_FFT        *fft=(Mat_FFT*)A->data;
1179   Mat_FFTW       *fftw;
1180   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1181   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1182   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1183   PetscBool      flg;
1184   PetscInt       p_flag,partial_dim=1,ctr;
1185   PetscMPIInt    size,rank;
1186   ptrdiff_t      *pdim;
1187   ptrdiff_t      local_n1,local_1_start;
1188 #if !defined(PETSC_USE_COMPLEX)
1189   ptrdiff_t      temp;
1190   PetscInt       N1,tot_dim;
1191 #else
1192   PetscInt       n1;
1193 #endif
1194 
1195   PetscFunctionBegin;
1196   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1197   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1198   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1199 
1200   fftw_mpi_init();
1201   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1202   pdim[0] = dim[0];
1203 #if !defined(PETSC_USE_COMPLEX)
1204   tot_dim = 2*dim[0];
1205 #endif
1206   for (ctr=1; ctr<ndim; ctr++) {
1207     partial_dim *= dim[ctr];
1208     pdim[ctr]    = dim[ctr];
1209 #if !defined(PETSC_USE_COMPLEX)
1210     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1211     else             tot_dim *= dim[ctr];
1212 #endif
1213   }
1214 
1215   if (size == 1) {
1216 #if defined(PETSC_USE_COMPLEX)
1217     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1218     n    = N;
1219 #else
1220     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1221     n    = tot_dim;
1222 #endif
1223 
1224   } else {
1225     ptrdiff_t local_n0,local_0_start;
1226     switch (ndim) {
1227     case 1:
1228 #if !defined(PETSC_USE_COMPLEX)
1229       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1230 #else
1231       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1232 
1233       n    = (PetscInt)local_n0;
1234       n1   = (PetscInt)local_n1;
1235       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1236 #endif
1237       break;
1238     case 2:
1239 #if defined(PETSC_USE_COMPLEX)
1240       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1241       /*
1242        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1243        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1244        */
1245       n    = (PetscInt)local_n0*dim[1];
1246       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1247 #else
1248       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1249 
1250       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1251       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1252 #endif
1253       break;
1254     case 3:
1255 #if defined(PETSC_USE_COMPLEX)
1256       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1257 
1258       n    = (PetscInt)local_n0*dim[1]*dim[2];
1259       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1260 #else
1261       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1262 
1263       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1264       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1265 #endif
1266       break;
1267     default:
1268 #if defined(PETSC_USE_COMPLEX)
1269       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1270 
1271       n    = (PetscInt)local_n0*partial_dim;
1272       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1273 #else
1274       temp = pdim[ndim-1];
1275 
1276       pdim[ndim-1] = temp/2 + 1;
1277 
1278       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1279 
1280       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1281       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1282 
1283       pdim[ndim-1] = temp;
1284 
1285       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1286 #endif
1287       break;
1288     }
1289   }
1290   free(pdim);
1291   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1292   ierr      = PetscNewLog(A,&fftw);CHKERRQ(ierr);
1293   fft->data = (void*)fftw;
1294 
1295   fft->n            = n;
1296   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1297   fftw->partial_dim = partial_dim;
1298 
1299   ierr = PetscMalloc1(ndim, &(fftw->dim_fftw));CHKERRQ(ierr);
1300   if (size == 1) {
1301 #if defined(PETSC_USE_64BIT_INDICES)
1302     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1303 #else
1304     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1305 #endif
1306   }
1307 
1308   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1309 
1310   fftw->p_forward  = NULL;
1311   fftw->p_backward = NULL;
1312   fftw->p_flag     = FFTW_ESTIMATE;
1313 
1314   if (size == 1) {
1315     A->ops->mult          = MatMult_SeqFFTW;
1316     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1317   } else {
1318     A->ops->mult          = MatMult_MPIFFTW;
1319     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1320   }
1321   fft->matdestroy = MatDestroy_FFTW;
1322   A->assembled    = PETSC_TRUE;
1323   A->preallocated = PETSC_TRUE;
1324 
1325   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);CHKERRQ(ierr);
1326   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1327   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1328 
1329   /* get runtime options */
1330   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1331   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);CHKERRQ(ierr);
1332   if (flg) {
1333     fftw->p_flag = iplans[p_flag];
1334   }
1335   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 
1340 
1341 
1342