xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision ba61784b0d5279c10b9d3cdd07d6ab8635a1fe44) !
1 /*
2     Provides an interface to the FFTW package.
3     Testing examples can be found in ~src/mat/tests
4 */
5 
6 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
7 EXTERN_C_BEGIN
8 #if !PetscDefined(HAVE_MPIUNI)
9   #include <fftw3-mpi.h>
10 #else
11   #include <fftw3.h>
12 #endif
13 EXTERN_C_END
14 
/* Private data for MATFFTW: lazily created FFTW plans plus the dimension
   metadata needed to (re)create and re-execute them. */
typedef struct {
  PetscInt   ndim_fftw; /* number of transform dimensions as handed to FFTW's MPI planners */
  ptrdiff_t *dim_fftw;  /* dimension sizes in FFTW's preferred ptrdiff_t type */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* size/stride descriptors for the guru64 planner interface */
#else
  fftw_iodim *iodims; /* size/stride descriptors for the guru planner interface */
#endif
  PetscInt     partial_dim;           /* partial product of the dimensions, set during matrix creation -- TODO confirm exact meaning at the setup site */
  fftw_plan    p_forward, p_backward; /* cached plans; NULL until first MatMult/MatMultTranspose */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw_execute() can only be executed for the arrays with which the plan was created */
} Mat_FFTW;
28 
29 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
30 
/* MatMult_SeqFFTW - y = F(x), the sequential forward DFT.

   The plan is created lazily on the first call and cached in fftw->p_forward.
   fftw_execute() may only be invoked with the arrays the plan was created with,
   so finarray/foutarray record those arrays and the new-array execute functions
   are used when the caller supplies different vectors. */
static PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  Vec                xx; /* scratch input, used only while planning destructively */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default: /* ndim > 3 (or 0): use the guru interface with explicit strides */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        /* row-major data: the last dimension is contiguous and the stride of
           dimension i is the product of the sizes of all later dimensions */
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      /* NOTE(review): casting PetscInt* to int* is only valid when PetscInt is int;
         with PETSC_USE_64BIT_INDICES this would pass wrong dimension data -- TODO
         confirm that configuration is excluded or copy dims into an int array */
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* the plan was created directly on x/y; remember them so fftw_execute() can be reused */
      fftw->finarray  = (PetscScalar *)x_array;
      fftw->foutarray = y_array;
    }
  }

  if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_forward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
134 
/* MatMultTranspose_SeqFFTW - y = F^T(x), the sequential backward DFT
   (unnormalized, per FFTW convention).

   Mirrors MatMult_SeqFFTW: the backward plan is created lazily and cached;
   binarray/boutarray remember the arrays the plan was created with so the
   new-array execute functions can be used for other vectors. */
static PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  Vec                xx; /* scratch input, used only while planning destructively */
#if defined(PETSC_USE_COMPLEX)
  /* NOTE(review): iodims is filled in MatMult_SeqFFTW's default case -- TODO
     confirm the forward transform is always planned first when ndim > 3 */
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims;
  #else
  fftw_iodim *iodims = fftw->iodims;
  #endif
#endif

  PetscFunctionBegin;
  if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default: /* ndim > 3 (or 0): guru interface, strides prepared by the forward path */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #endif
#else
      /* NOTE(review): casting PetscInt* to int* is only valid when PetscInt is int;
         see the matching comment in MatMult_SeqFFTW -- TODO confirm for 64-bit indices */
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* the plan was created directly on x/y; remember them so fftw_execute() can be reused */
      fftw->binarray  = (PetscScalar *)x_array;
      fftw->boutarray = y_array;
    }
  }
  if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_backward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
218 
219 #if !PetscDefined(HAVE_MPIUNI)
MatMult_MPIFFTW(Mat A,Vec x,Vec y)220 static PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
221 {
222   Mat_FFT           *fft  = (Mat_FFT *)A->data;
223   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
224   const PetscScalar *x_array;
225   PetscScalar       *y_array;
226   PetscInt           ndim = fft->ndim, *dim = fft->dim;
227   MPI_Comm           comm;
228   Vec                xx;
229 
230   PetscFunctionBegin;
231   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
232   if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
233     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
234     PetscCall(VecDuplicate(x, &xx));
235     PetscCall(VecGetArrayRead(xx, &x_array));
236   } else {
237     PetscCall(VecGetArrayRead(x, &x_array));
238   }
239   PetscCall(VecGetArray(y, &y_array));
240   if (!fftw->p_forward) { /* create a plan, then execute it */
241     switch (ndim) {
242     case 1:
243   #if defined(PETSC_USE_COMPLEX)
244       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
245   #else
246       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
247   #endif
248       break;
249     case 2:
250   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
251       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
252   #else
253       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
254   #endif
255       break;
256     case 3:
257   #if defined(PETSC_USE_COMPLEX)
258       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
259   #else
260       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
261   #endif
262       break;
263     default:
264   #if defined(PETSC_USE_COMPLEX)
265       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
266   #else
267       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
268   #endif
269       break;
270     }
271     if (fftw->p_flag != FFTW_ESTIMATE) {
272       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
273       PetscCall(VecRestoreArrayRead(xx, &x_array));
274       PetscCall(VecDestroy(&xx));
275       PetscCall(VecGetArrayRead(x, &x_array));
276     } else {
277       fftw->finarray  = (PetscScalar *)x_array;
278       fftw->foutarray = y_array;
279     }
280   }
281   if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
282     fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
283   } else {
284     fftw_execute(fftw->p_forward);
285   }
286   PetscCall(VecRestoreArray(y, &y_array));
287   PetscCall(VecRestoreArrayRead(x, &x_array));
288   PetscFunctionReturn(PETSC_SUCCESS);
289 }
290 
MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)291 static PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
292 {
293   Mat_FFT           *fft  = (Mat_FFT *)A->data;
294   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
295   const PetscScalar *x_array;
296   PetscScalar       *y_array;
297   PetscInt           ndim = fft->ndim, *dim = fft->dim;
298   MPI_Comm           comm;
299   Vec                xx;
300 
301   PetscFunctionBegin;
302   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
303   if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
304     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
305     PetscCall(VecDuplicate(x, &xx));
306     PetscCall(VecGetArrayRead(xx, &x_array));
307   } else {
308     PetscCall(VecGetArrayRead(x, &x_array));
309   }
310   PetscCall(VecGetArray(y, &y_array));
311   if (!fftw->p_backward) { /* create a plan, then execute it */
312     switch (ndim) {
313     case 1:
314   #if defined(PETSC_USE_COMPLEX)
315       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
316   #else
317       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
318   #endif
319       break;
320     case 2:
321   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
322       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
323   #else
324       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
325   #endif
326       break;
327     case 3:
328   #if defined(PETSC_USE_COMPLEX)
329       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
330   #else
331       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
332   #endif
333       break;
334     default:
335   #if defined(PETSC_USE_COMPLEX)
336       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
337   #else
338       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
339   #endif
340       break;
341     }
342     if (fftw->p_flag != FFTW_ESTIMATE) {
343       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
344       PetscCall(VecRestoreArrayRead(xx, &x_array));
345       PetscCall(VecDestroy(&xx));
346       PetscCall(VecGetArrayRead(x, &x_array));
347     } else {
348       fftw->binarray  = (PetscScalar *)x_array;
349       fftw->boutarray = y_array;
350     }
351   }
352   if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
353     fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
354   } else {
355     fftw_execute(fftw->p_backward);
356   }
357   PetscCall(VecRestoreArray(y, &y_array));
358   PetscCall(VecRestoreArrayRead(x, &x_array));
359   PetscFunctionReturn(PETSC_SUCCESS);
360 }
361 #endif
362 
MatDestroy_FFTW(Mat A)363 static PetscErrorCode MatDestroy_FFTW(Mat A)
364 {
365   Mat_FFT  *fft  = (Mat_FFT *)A->data;
366   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
367 
368   PetscFunctionBegin;
369   fftw_destroy_plan(fftw->p_forward);
370   fftw_destroy_plan(fftw->p_backward);
371   if (fftw->iodims) free(fftw->iodims);
372   PetscCall(PetscFree(fftw->dim_fftw));
373   PetscCall(PetscFree(fft->data));
374   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
375   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
376   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
377 #if !PetscDefined(HAVE_MPIUNI)
378   fftw_mpi_cleanup();
379 #endif
380   PetscFunctionReturn(PETSC_SUCCESS);
381 }
382 
383 #if !PetscDefined(HAVE_MPIUNI)
384   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
VecDestroy_MPIFFTW(Vec v)385 static PetscErrorCode VecDestroy_MPIFFTW(Vec v)
386 {
387   PetscScalar *array;
388 
389   PetscFunctionBegin;
390   PetscCall(VecGetArray(v, &array));
391   fftw_free((fftw_complex *)array);
392   PetscCall(VecRestoreArray(v, &array));
393   PetscCall(VecDestroy_MPI(v));
394   PetscFunctionReturn(PETSC_SUCCESS);
395 }
396 #endif
397 
398 #if !PetscDefined(HAVE_MPIUNI)
VecDuplicate_FFTW_fin(Vec fin,Vec * fin_new)399 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
400 {
401   Mat A;
402 
403   PetscFunctionBegin;
404   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
405   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
406   PetscFunctionReturn(PETSC_SUCCESS);
407 }
408 
VecDuplicate_FFTW_fout(Vec fout,Vec * fout_new)409 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
410 {
411   Mat A;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
415   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
416   PetscFunctionReturn(PETSC_SUCCESS);
417 }
418 
VecDuplicate_FFTW_bout(Vec bout,Vec * bout_new)419 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
420 {
421   Mat A;
422 
423   PetscFunctionBegin;
424   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
425   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
426   PetscFunctionReturn(PETSC_SUCCESS);
427 }
428 #endif
429 
430 /*@
431   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
432   parallel layout determined by `MATFFTW`
433 
434   Collective
435 
436   Input Parameter:
437 . A - the matrix
438 
439   Output Parameters:
440 + x - (optional) input vector of forward FFTW
441 . y - (optional) output vector of forward FFTW
442 - z - (optional) output vector of backward FFTW
443 
444   Options Database Key:
445 . -mat_fftw_plannerflags - set FFTW planner flags
446 
447   Level: advanced
448 
  Notes:
  The parallel layout of the output of the forward FFTW is always the same as the input
  of the backward FFTW. But the parallel layout of the input vector of the forward
  FFTW might not be the same as the output of the backward FFTW.
453 
  We need to provide enough space while doing a parallel real transform.
  We need to pad extra zeros at the end of the last dimension; for this reason one needs to
  invoke the fftw_mpi_local_size_transposed() routine. Remember that one has to change the
  last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
  depends on whether the last dimension is even or odd: if the last dimension is even one needs
  to pad two zeros; if it is odd only one zero is needed.
460 
461   Lastly one needs some scratch space at the end of data set in each process. alloc_local
462   figures out how much space is needed, i.e. it figures out the data+scratch space for
463   each processor and returns that.
464 
465   Developer Notes:
466   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?
467 
468 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
469 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A ("MatCreateVecsFFTW_C",
     i.e. MatCreateVecsFFTW_FFTW for MATFFTW); errors if none is composed */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
476 
MatCreateVecsFFTW_FFTW(Mat A,Vec * fin,Vec * fout,Vec * bout)477 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
478 {
479   PetscMPIInt size, rank;
480   MPI_Comm    comm;
481   Mat_FFT    *fft = (Mat_FFT *)A->data;
482 
483   PetscFunctionBegin;
484   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
485   PetscValidType(A, 1);
486   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
487 
488   PetscCallMPI(MPI_Comm_size(comm, &size));
489   PetscCallMPI(MPI_Comm_rank(comm, &rank));
490   if (size == 1) { /* sequential case */
491 #if defined(PETSC_USE_COMPLEX)
492     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
493     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
494     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
495 #else
496     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
497     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
498     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
499 #endif
500 #if !PetscDefined(HAVE_MPIUNI)
501   } else { /* parallel cases */
502     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
503     PetscInt      ndim = fft->ndim, *dim = fft->dim;
504     ptrdiff_t     alloc_local, local_n0, local_0_start;
505     ptrdiff_t     local_n1;
506     fftw_complex *data_fout;
507     ptrdiff_t     local_1_start;
508     PetscInt      n1, N1;
509   #if defined(PETSC_USE_COMPLEX)
510     fftw_complex *data_fin, *data_bout;
511   #else
512     double   *data_finr, *data_boutr;
513     ptrdiff_t temp;
514   #endif
515 
516     switch (ndim) {
517     case 1:
518   #if !defined(PETSC_USE_COMPLEX)
519       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
520   #else
521       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
522       if (fin) {
523         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
524         PetscCall(PetscIntCast(local_n0, &n1));
525         PetscCall(PetscIntCast(fft->N, &N1));
526         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_fin, fin));
527         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
528         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
529         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
530       }
531       if (fout) {
532         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
533         PetscCall(PetscIntCast(local_n1, &n1));
534         PetscCall(PetscIntCast(fft->N, &N1));
535         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_fout, fout));
536         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
537         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
538         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
539       }
540       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
541       if (bout) {
542         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
543         PetscCall(PetscIntCast(local_n1, &n1));
544         PetscCall(PetscIntCast(fft->N, &N1));
545         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_bout, bout));
546         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
547         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
548         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
549       }
550       break;
551   #endif
552     case 2:
553   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
554       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
555       PetscCall(PetscIntCast(2 * dim[0] * (dim[1] / 2 + 1), &N1));
556       PetscCall(PetscIntCast(2 * local_n0 * (dim[1] / 2 + 1), &n1));
557       if (fin) {
558         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
559         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
560         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
561         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
562         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
563       }
564       if (fout) {
565         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
566         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
567         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
568         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
569         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
570       }
571       if (bout) {
572         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
573         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
574         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
575         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
576         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
577       }
578   #else
579       /* Get local size */
580       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
581       if (fin) {
582         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
583         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
584         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
585         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
586         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
587       }
588       if (fout) {
589         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
590         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
591         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
592         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
593         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
594       }
595       if (bout) {
596         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
597         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
598         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
599         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
600         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
601       }
602   #endif
603       break;
604     case 3:
605   #if !defined(PETSC_USE_COMPLEX)
606       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
607       PetscCall(PetscIntCast(2 * dim[0] * dim[1] * (dim[2] / 2 + 1), &N1));
608       PetscCall(PetscIntCast(2 * local_n0 * dim[1] * (dim[2] / 2 + 1), &n1));
609       if (fin) {
610         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
611         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
612         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
613         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
614         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
615       }
616       if (fout) {
617         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
618         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
619         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
620         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
621         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
622       }
623       if (bout) {
624         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
625         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
626         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
627         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
628         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
629       }
630   #else
631       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
632       if (fin) {
633         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
634         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
635         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
636         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
637         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
638       }
639       if (fout) {
640         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
641         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
642         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
643         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
644         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
645       }
646       if (bout) {
647         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
648         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
649         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
650         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
651         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
652       }
653   #endif
654       break;
655     default:
656   #if !defined(PETSC_USE_COMPLEX)
657       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
658       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
659       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
660       N1                                    = 2 * fft->N * (PetscInt)(fftw->dim_fftw[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
661       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
662 
663       if (fin) {
664         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
665         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
666         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
667         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
668         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
669       }
670       if (fout) {
671         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
672         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
673         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
674         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
675         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
676       }
677       if (bout) {
678         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
679         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
680         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
681         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
682         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
683       }
684   #else
685       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
686       if (fin) {
687         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
688         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
689         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
690         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
691         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
692       }
693       if (fout) {
694         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
695         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
696         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
697         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
698         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
699       }
700       if (bout) {
701         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
702         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
703         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
704         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
705         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
706       }
707   #endif
708       break;
709     }
710     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
711        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
712        memory leaks. We void these pointers here to avoid accident uses.
713     */
714     if (fin) (*fin)->ops->replacearray = NULL;
715     if (fout) (*fout)->ops->replacearray = NULL;
716     if (bout) (*bout)->ops->replacearray = NULL;
717 #endif
718   }
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
722 /*@
723   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
724 
725   Collective
726 
727   Input Parameters:
728 + A - FFTW matrix
729 - x - the PETSc vector
730 
731   Output Parameter:
732 . y - the FFTW vector
733 
734   Level: intermediate
735 
736   Note:
  For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
738   one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
739   zeros. This routine does that job by scattering operation.
740 
741 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
742 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW for MATFFTW); PetscUseMethod errors if A has no
     such method composed */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
749 
/*
  VecScatterPetscToFFTW_FFTW - MATFFTW implementation of VecScatterPetscToFFTW().

  Builds (and immediately applies) a VecScatter copying the PETSc vector x into
  the FFTW vector y. For parallel real transforms FFTW pads the last dimension
  of its arrays to 2*(dim[last]/2+1) entries per row, so general index sets are
  used to skip the padding; for complex transforms the copy is a contiguous
  shift from FFTW's slab offset to y's ownership range.
*/
static PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
{
  MPI_Comm    comm;
  Mat_FFT    *fft = (Mat_FFT *)A->data;
  PetscInt    low; /* first global index of y owned by this rank */
  PetscMPIInt rank, size;
  PetscInt    vsize, vsize1;
  VecScatter  vecscat;
  IS          list1;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(VecGetOwnershipRange(y, &low, NULL));

  if (size == 1) {
    /* sequential: identity copy of the first fft->N entries of x into y */
    PetscCall(VecGetSize(x, &vsize));
    PetscCall(VecGetSize(y, &vsize1));
    PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
    PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
    PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterDestroy(&vecscat));
    PetscCall(ISDestroy(&list1));
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
    PetscInt  ndim = fft->ndim, *dim = fft->dim, n1;
    ptrdiff_t local_n0, local_0_start;
    ptrdiff_t local_n1, local_1_start;
    IS        list2;
  #if !defined(PETSC_USE_COMPLEX)
    PetscInt  i, j, k, partial_dim;
    PetscInt *indx1, *indx2, tempindx, tempindx1;
    PetscInt  NM; /* padded length of the last dimension: 2*(dim[last]/2+1) */
    ptrdiff_t temp;
  #else
    PetscInt nstart, nlow;
  #endif

    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      /* query FFTW for this rank's slab of the 1D block decomposition */
      fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscIntCast(local_n0, &n1));
      PetscCall(PetscIntCast(local_0_start, &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

      PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
      PetscCall(PetscIntCast(local_0_start * dim[1], &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));

      /* NM = 2*(dim[1]/2+1): row length in y including FFTW's r2c padding */
      if (dim[1] % 2 == 0) {
        NM = dim[1] + 2;
      } else {
        NM = dim[1] + 1;
      }
      for (i = 0; i < local_n0; i++) {
        for (j = 0; j < dim[1]; j++) {
          /* source uses the natural (unpadded) row length dim[1], destination
             skips the padding at the end of every row of length NM */
          tempindx  = i * dim[1] + j;
          tempindx1 = i * NM + j;

          PetscCall(PetscIntCast(local_0_start * dim[1] + tempindx, &indx1[tempindx]));
          indx2[tempindx] = low + tempindx1;
        }
      }

      PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
      PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;

    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

      PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
      PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2], &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
      SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
      /* NOTE(review): the code below is unreachable (SETERRQ above returns);
         retained as the starting point for a future fix */
      fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));

      /* NM = 2*(dim[2]/2+1): padded length of the last dimension */
      if (dim[2] % 2 == 0) NM = dim[2] + 2;
      else NM = dim[2] + 1;

      for (i = 0; i < local_n0; i++) {
        for (j = 0; j < dim[1]; j++) {
          for (k = 0; k < dim[2]; k++) {
            tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
            tempindx1 = i * dim[1] * NM + j * NM + k;

            PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2] + tempindx, &indx1[tempindx]));
            PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
          }
        }
      }

      PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;

    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

      /* partial_dim is the product of all dimensions except the first */
      PetscCall(PetscIntCast(local_n0 * (fftw->partial_dim), &n1));
      PetscCall(PetscIntCast(local_0_start * (fftw->partial_dim), &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
      SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
      /* NOTE(review): the code below is unreachable (SETERRQ above returns);
         retained as the starting point for a future fix */
      temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

      /* temporarily halve the last dimension to query the r2c layout */
      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

      fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

      partial_dim = fftw->partial_dim;

      PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));

      /* NM = number of padding entries appended to each last-dimension row */
      if (dim[ndim - 1] % 2 == 0) NM = 2;
      else NM = 1;

      j = low;
      for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
        indx1[i] = local_0_start * partial_dim + i;
        indx2[i] = j;
        if (k % dim[ndim - 1] == 0) j += NM; /* skip padding at end of each row */
        j++;
      }
      PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;
    }
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
971 
972 /*@
973   VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
974 
975   Collective
976 
977   Input Parameters:
978 + A - `MATFFTW` matrix
979 - x - FFTW vector
980 
981   Output Parameter:
982 . y - PETSc vector
983 
984   Level: intermediate
985 
986   Note:
987   While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
988   `VecScatterFFTWToPetsc()` removes those extra zeros.
989 
990 .seealso: [](ch_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
991 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW); PetscUseMethod errors if A has no
     such method composed */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
998 
/*
  VecScatterFFTWToPetsc_FFTW - MATFFTW implementation of VecScatterFFTWToPetsc().

  Inverse of VecScatterPetscToFFTW_FFTW(): builds (and immediately applies) a
  VecScatter copying the FFTW vector x back into the PETSc vector y, dropping
  the r2c padding (the extra zeros at the end of each last-dimension row) that
  FFTW requires for parallel real transforms.
*/
static PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
{
  MPI_Comm    comm;
  Mat_FFT    *fft = (Mat_FFT *)A->data;
  PetscInt    low; /* first global index of x (the FFTW vector) owned by this rank */
  PetscMPIInt rank, size;
  VecScatter  vecscat;
  IS          list1;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(VecGetOwnershipRange(x, &low, NULL));

  if (size == 1) {
    /* sequential: identity copy of the first fft->N entries of x into y */
    PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
    PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
    PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterDestroy(&vecscat));
    PetscCall(ISDestroy(&list1));

#if !PetscDefined(HAVE_MPIUNI)
  } else {
    Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
    PetscInt  ndim = fft->ndim, *dim = fft->dim;
    ptrdiff_t local_n0, local_0_start;
    ptrdiff_t local_n1, local_1_start;
    IS        list2;
  #if !defined(PETSC_USE_COMPLEX)
    PetscInt  i, j, k, partial_dim;
    PetscInt *indx1, *indx2, tempindx, tempindx1;
    PetscInt  NM; /* padded length of the last dimension: 2*(dim[last]/2+1) */
    ptrdiff_t temp;
  #else
    PetscInt nlow, nstart;
  #endif
    PetscInt n1;
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      /* backward-transform decomposition: the output slab is (local_n1, local_1_start) */
      fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscIntCast(local_n1, &n1));
      PetscCall(PetscIntCast(local_1_start, &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

      PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
      PetscCall(PetscIntCast(local_0_start * dim[1], &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      /* note: source/destination index sets are swapped relative to
         VecScatterPetscToFFTW_FFTW, since the copy goes FFTW -> PETSc */
      PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));

      /* NM = 2*(dim[1]/2+1): row length in x including FFTW's r2c padding */
      if (dim[1] % 2 == 0) NM = dim[1] + 2;
      else NM = dim[1] + 1;

      for (i = 0; i < local_n0; i++) {
        for (j = 0; j < dim[1]; j++) {
          /* indx1 is the natural (unpadded) layout, indx2 addresses the padded
             rows of the FFTW vector */
          tempindx  = i * dim[1] + j;
          tempindx1 = i * NM + j;

          PetscCall(PetscIntCast(local_0_start * dim[1] + tempindx, &indx1[tempindx]));
          PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
        }
      }

      PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
      PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

      PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

      PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
      PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2], &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));

      /* NM = 2*(dim[2]/2+1): padded length of the last dimension */
      if (dim[2] % 2 == 0) NM = dim[2] + 2;
      else NM = dim[2] + 1;

      for (i = 0; i < local_n0; i++) {
        for (j = 0; j < dim[1]; j++) {
          for (k = 0; k < dim[2]; k++) {
            tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
            tempindx1 = i * dim[1] * NM + j * NM + k;

            PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2] + tempindx, &indx1[tempindx]));
            PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
          }
        }
      }

      PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
      PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

      PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

      /* partial_dim is the product of all dimensions except the first */
      PetscCall(PetscIntCast(local_n0 * (fftw->partial_dim), &n1));
      PetscCall(PetscIntCast(local_0_start * (fftw->partial_dim), &nstart));
      PetscCall(PetscIntCast(low, &nlow));
      PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
      PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
      PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
  #else
      temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

      /* temporarily halve the last dimension to query the r2c layout */
      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

      fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

      partial_dim = fftw->partial_dim;

      PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
      PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));

      /* NM = number of padding entries appended to each last-dimension row */
      if (dim[ndim - 1] % 2 == 0) NM = 2;
      else NM = 1;

      j = low;
      for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
        PetscCall(PetscIntCast(local_0_start * partial_dim + i, &indx1[i]));
        indx2[i] = j;
        if (k % dim[ndim - 1] == 0) j += NM; /* skip padding at end of each row */
        j++;
      }
      PetscCall(PetscIntCast(local_n0 * partial_dim, &n1));
      PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
      PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

      PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
      PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterDestroy(&vecscat));
      PetscCall(ISDestroy(&list1));
      PetscCall(ISDestroy(&list2));
      PetscCall(PetscFree(indx1));
      PetscCall(PetscFree(indx2));
  #endif
      break;
    }
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1214 
1215 /*MC
1216   MATFFTW -  "fftw" - Matrix type that provides FFT via the FFTW external package.
1217 
1218   Level: intermediate
1219 
1220 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatType`, `MatCreateFFT()`
1221 M*/
MatCreate_FFTW(Mat A)1222 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1223 {
1224   MPI_Comm    comm;
1225   Mat_FFT    *fft = (Mat_FFT *)A->data;
1226   Mat_FFTW   *fftw;
1227   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1228   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1229   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1230   PetscBool   flg;
1231   PetscInt    p_flag, partial_dim = 1, ctr;
1232   PetscMPIInt size, rank;
1233   ptrdiff_t  *pdim;
1234 #if !defined(PETSC_USE_COMPLEX)
1235   PetscInt tot_dim;
1236 #endif
1237 
1238   PetscFunctionBegin;
1239   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1240   PetscCallMPI(MPI_Comm_size(comm, &size));
1241   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1242 
1243 #if !PetscDefined(HAVE_MPIUNI)
1244   fftw_mpi_init();
1245 #endif
1246   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1247   pdim[0] = dim[0];
1248 #if !defined(PETSC_USE_COMPLEX)
1249   tot_dim = 2 * dim[0];
1250 #endif
1251   for (ctr = 1; ctr < ndim; ctr++) {
1252     partial_dim *= dim[ctr];
1253     pdim[ctr] = dim[ctr];
1254 #if !defined(PETSC_USE_COMPLEX)
1255     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1256     else tot_dim *= dim[ctr];
1257 #endif
1258   }
1259 
1260   if (size == 1) {
1261 #if defined(PETSC_USE_COMPLEX)
1262     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1263     fft->n = fft->N;
1264 #else
1265     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1266     fft->n = tot_dim;
1267 #endif
1268 #if !PetscDefined(HAVE_MPIUNI)
1269   } else {
1270     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1271   #if !defined(PETSC_USE_COMPLEX)
1272     ptrdiff_t temp;
1273     PetscInt  N1;
1274   #else
1275     PetscInt n1;
1276   #endif
1277 
1278     switch (ndim) {
1279     case 1:
1280   #if !defined(PETSC_USE_COMPLEX)
1281       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1282   #else
1283       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1284       PetscCall(PetscIntCast(local_n0, &fft->n));
1285       PetscCall(PetscIntCast(local_n1, &n1));
1286       PetscCall(MatSetSizes(A, n1, fft->n, fft->N, fft->N));
1287   #endif
1288       break;
1289     case 2:
1290   #if defined(PETSC_USE_COMPLEX)
1291       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1292       fft->n = (PetscInt)local_n0 * dim[1];
1293       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1294   #else
1295       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1296 
1297       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1298       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1299   #endif
1300       break;
1301     case 3:
1302   #if defined(PETSC_USE_COMPLEX)
1303       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1304 
1305       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1306       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1307   #else
1308       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1309 
1310       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1311       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1312   #endif
1313       break;
1314     default:
1315   #if defined(PETSC_USE_COMPLEX)
1316       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1317 
1318       PetscCall(PetscIntCast(local_n0 * partial_dim, &fft->n));
1319       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1320   #else
1321       temp = pdim[ndim - 1];
1322 
1323       pdim[ndim - 1] = temp / 2 + 1;
1324 
1325       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1326 
1327       PetscCall(PetscIntCast(2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp, &fft->n));
1328       N1 = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1329 
1330       pdim[ndim - 1] = temp;
1331 
1332       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1333   #endif
1334       break;
1335     }
1336 #endif
1337   }
1338   free(pdim);
1339   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1340   PetscCall(PetscNew(&fftw));
1341   fft->data = (void *)fftw;
1342 
1343   fftw->ndim_fftw   = ndim; /* This is dimension of fft */
1344   fftw->partial_dim = partial_dim;
1345 
1346   PetscCall(PetscMalloc1(ndim, &fftw->dim_fftw));
1347   if (size == 1) {
1348 #if defined(PETSC_USE_64BIT_INDICES)
1349     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1350 #else
1351     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1352 #endif
1353   }
1354 
1355   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1356 
1357   fftw->p_forward  = NULL;
1358   fftw->p_backward = NULL;
1359   fftw->p_flag     = FFTW_ESTIMATE;
1360 
1361   if (size == 1) {
1362     A->ops->mult          = MatMult_SeqFFTW;
1363     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1364 #if !PetscDefined(HAVE_MPIUNI)
1365   } else {
1366     A->ops->mult          = MatMult_MPIFFTW;
1367     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1368 #endif
1369   }
1370   fft->matdestroy = MatDestroy_FFTW;
1371   A->assembled    = PETSC_TRUE;
1372   A->preallocated = PETSC_TRUE;
1373 
1374   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1375   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1376   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1377 
1378   /* get runtime options */
1379   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1380   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1381   if (flg) fftw->p_flag = iplans[p_flag];
1382   PetscOptionsEnd();
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
1385