39static constexpr double TWO_PI = 6.283185307179586476925286766559;
43#ifdef NUMERICS_HAS_AVX2
46static inline void butterfly(
num::cplx* __restrict__ u,
51 auto* ud =
reinterpret_cast<double*
>(u);
52 auto* vd =
reinterpret_cast<double*
>(v);
53 auto* wd =
reinterpret_cast<const double*
>(tw);
56 for (; j + 1 < hlen; j += 2) {
57 __m256d U = _mm256_loadu_pd(ud + 2*j);
58 __m256d V = _mm256_loadu_pd(vd + 2*j);
59 __m256d W = _mm256_loadu_pd(wd + 2*j);
61 __m256d Wre = _mm256_unpacklo_pd(W, W);
62 __m256d Wim = _mm256_unpackhi_pd(W, W);
63 __m256d Vsw = _mm256_permute_pd(V, 0x5);
64 __m256d T = _mm256_addsub_pd(
65 _mm256_mul_pd(V, Wre),
66 _mm256_mul_pd(Vsw, Wim));
67 _mm256_storeu_pd(ud + 2*j, _mm256_add_pd(U, T));
68 _mm256_storeu_pd(vd + 2*j, _mm256_sub_pd(U, T));
71 for (; j < hlen; ++j) {
79#elif defined(NUMERICS_HAS_NEON)
82static inline void butterfly(
num::cplx* __restrict__ u,
87 auto* ud =
reinterpret_cast<double*
>(u);
88 auto* vd =
reinterpret_cast<double*
>(v);
89 auto* wd =
reinterpret_cast<const double*
>(tw);
92 for (; j + 1 < hlen; j += 2) {
94 float64x2x2_t U = vld2q_f64(ud + 2*j);
95 float64x2x2_t V = vld2q_f64(vd + 2*j);
96 float64x2x2_t W = vld2q_f64(wd + 2*j);
100 float64x2_t Tr = vfmsq_f64(vmulq_f64(V.val[0], W.val[0]),
103 float64x2_t Ti = vfmaq_f64(vmulq_f64(V.val[0], W.val[1]),
106 float64x2x2_t Ru, Rv;
107 Ru.val[0] = vaddq_f64(U.val[0], Tr);
108 Ru.val[1] = vaddq_f64(U.val[1], Ti);
109 Rv.val[0] = vsubq_f64(U.val[0], Tr);
110 Rv.val[1] = vsubq_f64(U.val[1], Ti);
111 vst2q_f64(ud + 2*j, Ru);
112 vst2q_f64(vd + 2*j, Rv);
115 for (; j < hlen; ++j) {
129 for (
int j = 0; j < hlen; ++j) {
148 if (n_ == 0 || (n_ & (n_ - 1)))
149 throw std::invalid_argument(
"FFTPlan: length must be a power of two");
150 for (
int len = 2; len <= n_; len <<= 1) {
151 double ang = TWO_PI /
static_cast<double>(len) * (inv ? 1.0 : -1.0);
152 num::cplx wlen{std::cos(ang), std::sin(ang)};
153 std::vector<num::cplx> tw(len / 2);
155 for (
int j = 0; j < len / 2; ++j) { tw[j] = w; w *= wlen; }
164 for (
int len = 2; len <=
n; len <<= 1, ++stage) {
167 for (
int i = 0; i <
n; i += len)
168 butterfly(data + i, data + i + hlen, tw, hlen);
176 int n =
static_cast<int>(in.
size());
177 for (
int i = 0; i < n; ++i) out[i] = in[i];
183 int n =
static_cast<int>(in.
size());
184 for (
int i = 0; i < n; ++i) out[i] = in[i];
190 int n =
static_cast<int>(in.
size());
192 for (
int i = 0; i < n; ++i) tmp[i] = {in[i], 0.0};
195 for (
int k = 0; k < n / 2 + 1; ++k) out[k] = tmp[k];
200 for (
int k = 0; k < n / 2 + 1; ++k) tmp[k] = in[k];
201 for (
int k = 1; k < (n - 1) / 2 + 1; ++k)
202 tmp[n - k] = std::conj(in[k]);
205 for (
int i = 0; i < n; ++i) out[i] = tmp[i].real();