11 #ifndef EIGEN_COMPLEX_NEON_H
12 #define EIGEN_COMPLEX_NEON_H
// Four-lane 32-bit mask in which only bit 31 of lanes 1 and 3 is set.
// When EIGEN_COMP_CLANG or EIGEN_COMP_CASTXML is set the vector is written
// with a brace initializer; the statements that follow load the same four
// words from a static table with vld1q_u32.
// NOTE(review): the directive separating the two forms is not part of this
// excerpt - confirm against the full file.
21 #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
22 uint32x4_t
ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
25 static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
26 return vld1q_u32( conj_XOR_DATA );
// Two-lane variant of the mask: lane 0 is zero, lane 1 has only bit 31 set.
// The table is loaded into a 64-bit register with vld1_u32.
32 static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
33 return vld1_u32( conj_XOR_DATA );
51 template<>
struct packet_traits<
std::
complex<float> > : default_packet_traits
77 typedef std::complex<float>
type;
89 template<>
struct unpacket_traits<Packet2cf>
91 typedef std::complex<float>
type;
105 {
return Packet1cf(vset_lane_f32(
a, vdup_n_f32(0.
f), 0)); }
107 {
return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(
a)))); }
110 {
return Packet1cf(vld1_f32(
reinterpret_cast<const float*
>(&from))); }
113 const float32x2_t r64 = vld1_f32(
reinterpret_cast<const float*
>(&from));
114 return Packet2cf(vcombine_f32(r64, r64));
// XOR the 32-bit lanes of b with the p4ui_CONJ_XOR() mask and hand the result
// back as float lanes wrapped in a Packet2cf.
138 return Packet2cf(vreinterpretq_f32_u32(veorq_u32(
b,
p4ui_CONJ_XOR())));
// Broadcast lane 0 of a.v into both lanes of v1.
146 v1 = vdup_lane_f32(
a.v, 0);
// Broadcast lane 1 of a.v into both lanes of v2.
148 v2 = vdup_lane_f32(
a.v, 1);
// v1 becomes b.v scaled lane-wise by lane 0 of a.v.
150 v1 = vmul_f32(
v1,
b.v);
// v2 becomes b.v scaled lane-wise by lane 1 of a.v.
152 v2 = vmul_f32(
v2,
b.v);
// Lane-wise recipe for two complex products at once. Write a.v as
// [a0, a1, a2, a3] and b.v as [b0, b1, b2, b3].
// v1 = [a0, a0, a2, a2]: lane 0 of each 64-bit half, broadcast within its half.
165 v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(
a.v), 0), vdup_lane_f32(vget_high_f32(
a.v), 0));
// v2 = [a1, a1, a3, a3]: lane 1 of each 64-bit half, broadcast within its half.
167 v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(
a.v), 1), vdup_lane_f32(vget_high_f32(
a.v), 1));
// v1 = [a0*b0, a0*b1, a2*b2, a2*b3]
169 v1 = vmulq_f32(
v1,
b.v);
// v2 = [a1*b0, a1*b1, a3*b2, a3*b3]
171 v2 = vmulq_f32(
v2,
b.v);
// XOR with the p4ui_CONJ_XOR() mask on the raw 32-bit lanes of v2.
173 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(
v2),
p4ui_CONJ_XOR()));
// Swap the two lanes inside each 64-bit half of v2.
175 v2 = vrev64q_f32(
v2);
// Lane-wise sum of the two partial products is the packet that is returned.
177 return Packet2cf(vaddq_f32(
v1,
v2));
// Reverse the two float lanes of eq, so each lane of eq_swapped holds the
// value that sits in the other lane of eq.
187 Packet2f eq_swapped = vrev64_f32(eq);
// Reverse the float lanes inside each 64-bit half of eq (lanes 0<->1 and
// 2<->3), pairing every lane with its neighbour in the same half.
198 Packet4f eq_swapped = vrev64q_f32(eq);
204 {
return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(
a.v), vreinterpret_u32_f32(
b.v)))); }
206 {
return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(
a.v), vreinterpretq_u32_f32(
b.v)))); }
209 {
return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(
a.v), vreinterpret_u32_f32(
b.v)))); }
211 {
return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(
a.v), vreinterpretq_u32_f32(
b.v)))); }
214 {
return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(
a.v), vreinterpret_u32_f32(
b.v)))); }
216 {
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(
a.v), vreinterpretq_u32_f32(
b.v)))); }
219 {
return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(
a.v), vreinterpret_u32_f32(
b.v)))); }
221 {
return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(
a.v), vreinterpretq_u32_f32(
b.v)))); }
240 template<>
EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to,
const Packet2cf& from)
245 template<>
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to,
const Packet2cf& from)
249 const std::complex<float>* from,
Index stride)
254 template<>
EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
255 const std::complex<float>* from,
Index stride)
261 return Packet2cf(
res);
266 { to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
// Scatter the two complex values held in `from` to strided positions of `to`:
// lanes 0 and 1 form the value written at index stride*0, lanes 2 and 3 form
// the value written at index stride*1. In each pair the lower lane becomes
// the real part and the upper lane the imaginary part.
267 template<>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
268 std::complex<float>* to,
const Packet2cf& from,
Index stride)
270 to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
271 to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
274 template<>
EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(
const std::complex<float> *addr)
280 vst1_f32(
reinterpret_cast<float*
>(&
x),
a.v);
286 vst1q_f32(
reinterpret_cast<float*
>(
x),
a.v);
292 {
return Packet2cf(vcombine_f32(vget_high_f32(
a.v), vget_low_f32(
a.v))); }
297 {
return Packet2cf(vrev64q_f32(
a.v)); }
301 std::complex<float>
s;
302 vst1_f32((
float *)&
s,
a.v);
307 std::complex<float>
s;
308 vst1_f32(
reinterpret_cast<float*
>(&
s), vadd_f32(vget_low_f32(
a.v), vget_high_f32(
a.v)));
314 std::complex<float>
s;
315 vst1_f32((
float *)&
s,
a.v);
321 std::complex<float>
s;
323 a1 = vget_low_f32(
a.v);
324 a2 = vget_high_f32(
a.v);
326 v1 = vdup_lane_f32(
a1, 0);
328 v2 = vdup_lane_f32(
a1, 1);
340 vst1_f32(
reinterpret_cast<float*
>(&
s),
prod);
355 s = vmul_f32(
b.v,
b.v);
356 rev_s = vrev64_f32(
s);
367 s = vmulq_f32(
b.v,
b.v);
368 rev_s = vrev64q_f32(
s);
376 Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
377 kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
378 kernel.packet[1].v = tmp;
382 return psqrt_complex<Packet1cf>(
a);
386 return psqrt_complex<Packet2cf>(
a);
390 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// Two-lane 64-bit mask: lane 0 is zero, lane 1 has only bit 63 set.
// When EIGEN_COMP_CLANG or EIGEN_COMP_CASTXML is set the vector is written
// with a brace initializer; the two statements after it build the same mask
// by loading a two-word table with vld1q_u64.
// NOTE(review): the directive separating the two forms is not part of this
// excerpt - confirm against the full file.
393 #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
394 static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
396 const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
397 static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
407 template<>
struct packet_traits<
std::
complex<double> > : default_packet_traits
409 typedef Packet1cd
type;
410 typedef Packet1cd
half;
431 template<>
struct unpacket_traits<Packet1cd>
433 typedef std::complex<double>
type;
434 typedef Packet1cd
half;
465 {
return Packet1cd(pnegate<Packet2d>(
a.v)); }
468 {
return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(
a.v), p2ul_CONJ_XOR))); }
// Lane-wise recipe for one complex product. Write a.v as [a0, a1] and
// b.v as [b0, b1].
// v1 = [a0, a0]: the low lane of a.v broadcast to both lanes.
475 v1 = vdupq_lane_f64(vget_low_f64(
a.v), 0);
// v2 = [a1, a1]: the high lane of a.v broadcast to both lanes.
477 v2 = vdupq_lane_f64(vget_high_f64(
a.v), 0);
// v1 = [a0*b0, a0*b1]
479 v1 = vmulq_f64(
v1,
b.v);
// v2 = [a1*b0, a1*b1]
481 v2 = vmulq_f64(
v2,
b.v);
// XOR with the p2ul_CONJ_XOR mask on the raw 64-bit lanes of v2.
483 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(
v2), p2ul_CONJ_XOR));
// NOTE(review): preverse<Packet2d> is defined outside this excerpt;
// presumably it swaps the two lanes of v2 - confirm.
485 v2 = preverse<Packet2d>(
v2);
// Lane-wise sum of the two partial products is the packet that is returned.
487 return Packet1cd(vaddq_f64(
v1,
v2));
497 Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
503 {
return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v)))); }
506 {
return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v)))); }
509 {
return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v)))); }
512 {
return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v)))); }
517 template<>
EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to,
const Packet1cd& from)
520 template<>
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to,
const Packet1cd& from)
523 template<>
EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(
const std::complex<double> *addr)
526 template<>
EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
527 const std::complex<double>* from,
Index stride)
532 return Packet1cd(
res);
535 template<>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
536 std::complex<double>* to,
const Packet1cd& from,
Index stride)
537 { to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
542 pstore<std::complex<double> >(&
res,
a);
569 Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
570 kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
571 kernel.packet[1].v = tmp;
575 return psqrt_complex<Packet1cd>(
a);
578 #endif // EIGEN_ARCH_ARM64
584 #endif // EIGEN_COMPLEX_NEON_H