10 #ifndef EIGEN_COMPLEX_AVX_H
11 #define EIGEN_COMPLEX_AVX_H
25 #ifndef EIGEN_VECTORIZE_AVX512
52 typedef std::complex<float>
type;
72 const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
78 __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(
a.v),
b.v);
79 __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(
a.v), _mm256_permute_ps(
b.v, _MM_SHUFFLE(2,3,0,1)));
80 __m256
result = _mm256_addsub_ps(tmp1, tmp2);
86 __m256 eq = _mm256_cmp_ps(
a.v,
b.v, _CMP_EQ_OQ);
87 return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
102 return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((
const double*)(
const void*)&from)));
110 return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(
a.v),
b.v, 1));
126 __m128 low = _mm256_extractf128_ps(from.v, 0);
127 to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
128 _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
129 to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
130 _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
132 __m128 high = _mm256_extractf128_ps(from.v, 1);
133 to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
134 _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
135 to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
136 _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
146 __m128 low = _mm256_extractf128_ps(
a.v, 0);
147 __m128 high = _mm256_extractf128_ps(
a.v, 1);
148 __m128d lowd = _mm_castps_pd(low);
149 __m128d highd = _mm_castps_pd(high);
150 low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0
x1));
151 high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0
x1));
152 __m256
result = _mm256_setzero_ps();
175 __m256 tmp = _mm256_mul_ps(
b.v,
b.v);
176 __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
177 __m256 denom = _mm256_add_ps(tmp, tmp2);
178 return Packet4cf(_mm256_div_ps(num.
v, denom));
183 return Packet4cf(_mm256_shuffle_ps(
x.v,
x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
194 #ifndef EIGEN_VECTORIZE_AVX512
221 typedef std::complex<double>
type;
238 const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0
x0,0
x0,0
x0,0x80000000,0
x0,0
x0,0
x0));
244 __m256d tmp1 = _mm256_shuffle_pd(
a.v,
a.v,0
x0);
245 __m256d even = _mm256_mul_pd(tmp1,
b.v);
246 __m256d tmp2 = _mm256_shuffle_pd(
a.v,
a.v,0xF);
247 __m256d tmp3 = _mm256_shuffle_pd(
b.v,
b.v,0
x5);
248 __m256d odd = _mm256_mul_pd(tmp2, tmp3);
249 return Packet2cd(_mm256_addsub_pd(even, odd));
254 __m256d eq = _mm256_cmp_pd(
a.v,
b.v, _CMP_EQ_OQ);
273 return Packet2cd(_mm256_broadcast_pd((
const __m128d*)(
const void*)&from));
289 __m128d low = _mm256_extractf128_pd(from.v, 0);
290 to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
291 __m128d high = _mm256_extractf128_pd(from.v, 1);
292 to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
297 __m128d low = _mm256_extractf128_pd(
a.v, 0);
299 _mm_store_pd(
res, low);
300 return std::complex<double>(
res[0],
res[1]);
304 __m256d
result = _mm256_permute2f128_pd(
a.v,
a.v, 1);
325 __m256d tmp = _mm256_mul_pd(
b.v,
b.v);
326 __m256d denom = _mm256_hadd_pd(tmp, tmp);
327 return Packet2cd(_mm256_div_pd(num.
v, denom));
337 __m256d
P0 = _mm256_castps_pd(kernel.
packet[0].
v);
338 __m256d
P1 = _mm256_castps_pd(kernel.
packet[1].
v);
339 __m256d
P2 = _mm256_castps_pd(kernel.
packet[2].
v);
340 __m256d
P3 = _mm256_castps_pd(kernel.
packet[3].
v);
342 __m256d T0 = _mm256_shuffle_pd(
P0,
P1, 15);
343 __m256d
T1 = _mm256_shuffle_pd(
P0,
P1, 0);
344 __m256d
T2 = _mm256_shuffle_pd(
P2,
P3, 15);
345 __m256d
T3 = _mm256_shuffle_pd(
P2,
P3, 0);
347 kernel.
packet[1].
v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0,
T2, 32));
348 kernel.
packet[3].
v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0,
T2, 49));
349 kernel.
packet[0].
v = _mm256_castpd_ps(_mm256_permute2f128_pd(
T1,
T3, 32));
350 kernel.
packet[2].
v = _mm256_castpd_ps(_mm256_permute2f128_pd(
T1,
T3, 49));
355 __m256d tmp = _mm256_permute2f128_pd(kernel.
packet[0].
v, kernel.
packet[1].
v, 0+(2<<4));
361 return psqrt_complex<Packet2cd>(
a);
365 return psqrt_complex<Packet4cf>(
a);
372 #endif // EIGEN_COMPLEX_AVX_H