11 #ifndef EIGEN_COMPLEX_NEON_H
12 #define EIGEN_COMPLEX_NEON_H
// Sign-flip mask with four 32-bit lanes: lanes 1 and 3 have only bit 31 set
// (the sign bit once the lane is viewed as a float), lanes 0 and 2 are zero.
// This form builds the vector directly from an initializer list.
// NOTE(review): presumably one branch of p4ui_CONJ_XOR(); the enclosing
// function header and preprocessor guard are not visible here - confirm.
21 uint32x4_t
ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
// Same four-lane mask, kept in a static array and loaded with vld1q_u32.
24 static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
25 return vld1q_u32( conj_XOR_DATA );
// Two-lane variant of the mask (only lane 1 has bit 31 set), loaded with
// vld1_u32 into a 64-bit vector.
30 static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
31 return vld1_u32( conj_XOR_DATA );
42 template<>
struct packet_traits<
std::
complex<float> > : default_packet_traits
// View the address of `from` as a float pointer and load the two consecutive
// floats stored there into the 64-bit vector r64.
70 r64 = vld1_f32((
const float *)&from);
// XOR the integer vector `b` with the mask returned by p4ui_CONJ_XOR(),
// reinterpret the resulting bits as four floats and wrap them in a Packet2cf.
81 return Packet2cf(vreinterpretq_f32_u32(veorq_u32(
b,
p4ui_CONJ_XOR())));
// Lane layout assumed below: [re0, im0, re1, im1], which is how the
// pgather specialization in this file fills a Packet2cf.
// v1 = lane 0 of each half of a, duplicated: [a.re0, a.re0, a.re1, a.re1].
89 v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(
a.v), 0), vdup_lane_f32(vget_high_f32(
a.v), 0));
// v2 = lane 1 of each half of a, duplicated: [a.im0, a.im0, a.im1, a.im1].
91 v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(
a.v), 1), vdup_lane_f32(vget_high_f32(
a.v), 1));
// v1 = [a.re*b.re, a.re*b.im] for each complex element.
93 v1 = vmulq_f32(v1,
b.v);
// v2 = [a.im*b.re, a.im*b.im] for each complex element.
95 v2 = vmulq_f32(v2,
b.v);
// Flip the sign bit of the odd lanes of v2: [a.im*b.re, -(a.im*b.im)].
97 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2),
p4ui_CONJ_XOR()));
// NOTE(review): original line 99 is not visible in this extract. For the sum
// below to be the complex product, the two lanes of each pair in v2 would
// have to be swapped first (the double-precision code in this file reverses
// v2 at the same point) - confirm against the full file.
// Lane-wise sum of the two partial products, wrapped in a Packet2cf.
101 return Packet2cf(vaddq_f32(v1, v2));
// The four returns below reinterpret a.v and b.v as 32-bit unsigned lanes,
// apply one bitwise operation, and reinterpret the result back to floats.
// The enclosing function headers are not visible in this extract.
// Bitwise AND: a & b.
106 return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(
a.v),vreinterpretq_u32_f32(
b.v))));
// Bitwise OR: a | b.
110 return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(
a.v),vreinterpretq_u32_f32(
b.v))));
// Bitwise XOR: a ^ b.
114 return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(
a.v),vreinterpretq_u32_f32(
b.v))));
// Bit clear: a & ~b (vbicq clears in a every bit that is set in b).
118 return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(
a.v),vreinterpretq_u32_f32(
b.v))));
// Gathers two std::complex<float> values, read from from[0] and from[stride],
// into a single Packet2cf.
//   from   - pointer to the first element to read
//   stride - distance, in complex elements, between the two reads
// Returns a packet whose lanes are [re(first), im(first), re(second), im(second)].
// The declaration of `res` and the function braces are not visible in this extract.
129 template<> EIGEN_DEVICE_FUNC
inline Packet2cf pgather<std::complex<float>, Packet2cf>(
const std::complex<float>* from,
Index stride)
// Lanes 0 and 1: real and imaginary part of from[0].
132 res = vsetq_lane_f32(
std::real(from[0*stride]), res, 0);
133 res = vsetq_lane_f32(
std::imag(from[0*stride]), res, 1);
// Lanes 2 and 3: real and imaginary part of from[stride].
134 res = vsetq_lane_f32(
std::real(from[1*stride]), res, 2);
135 res = vsetq_lane_f32(
std::imag(from[1*stride]), res, 3);
136 return Packet2cf(res);
// Scatters the two complex values held in `from` to to[0] and to[stride].
//   to     - destination of the first element
//   from   - packet whose lanes are read as (0,1) and (2,3) real/imag pairs
//   stride - distance, in complex elements, between the two writes
// The function braces are not visible in this extract.
139 template<> EIGEN_DEVICE_FUNC
inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
const Packet2cf& from,
Index stride)
// First element: lane 0 is the real part, lane 1 the imaginary part.
141 to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
// Second element: lane 2 is the real part, lane 3 the imaginary part.
142 to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
// Store the four float lanes of a.v to the memory `x` refers to, accessed
// through a float pointer. The declaration of `x` is not visible here.
150 vst1q_f32((
float *)
x,
a.v);
// Swap the two 64-bit halves of a.v. With two (re, im) pairs per packet this
// exchanges the first and second complex element.
// The declaration of a_r128 is not visible in this extract.
156 float32x2_t a_lo, a_hi;
159 a_lo = vget_low_f32(
a.v);
160 a_hi = vget_high_f32(
a.v);
// High half goes first, low half second.
161 a_r128 = vcombine_f32(a_hi, a_lo);
163 return Packet2cf(a_r128);
// Reverse the 32-bit lanes inside each 64-bit half of a.v, i.e. swap lanes
// 0<->1 and 2<->3, which exchanges the real and imaginary slot of each pair.
168 return Packet2cf(vrev64q_f32(
a.v));
// Horizontal add: sums the two complex elements of a into the scalar s.
// The declarations of a1/a2 and the return statement are not visible here.
174 std::complex<float>
s;
// a1 = (re0, im0), a2 = (re1, im1).
176 a1 = vget_low_f32(
a.v);
177 a2 = vget_high_f32(
a.v);
// Lane-wise sum: (re0 + re1, im0 + im1).
178 a2 = vadd_f32(a1, a2);
// Write the two floats directly over the storage of s.
179 vst1_f32((
float *)&
s, a2);
// Reduce two packets at once: the result's low half is the sum of the two
// halves of vecs[0], its high half is the sum of the two halves of vecs[1].
// sum1 = [low(vecs[0]), low(vecs[1])].
189 sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
// sum2 = [high(vecs[0]), high(vecs[1])].
190 sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
191 sum = vaddq_f32(sum1, sum2);
193 return Packet2cf(sum);
// Horizontal product: combines the two complex elements of a into the scalar
// s, using the same duplicate / multiply / sign-flip / add scheme as the
// packet multiply, but on 64-bit (two-lane) vectors.
198 float32x2_t a1, a2, v1, v2, prod;
199 std::complex<float>
s;
// a1 = (re0, im0), a2 = (re1, im1).
201 a1 = vget_low_f32(
a.v);
202 a2 = vget_high_f32(
a.v);
// v1 = (re0, re0).
204 v1 = vdup_lane_f32(a1, 0);
// v2 = (im0, im0).
206 v2 = vdup_lane_f32(a1, 1);
// v1 = (re0*re1, re0*im1).
208 v1 = vmul_f32(v1, a2);
// v2 = (im0*re1, im0*im1).
210 v2 = vmul_f32(v2, a2);
// Flip the sign bit of lane 1 of v2: (im0*re1, -(im0*im1)).
212 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2),
p2ui_CONJ_XOR()));
// NOTE(review): original line 214 is not visible in this extract. The two
// lanes of v2 would have to be swapped before the add for prod to be the
// complex product - confirm against the full file.
216 prod = vadd_f32(v1, v2);
// Write the two floats directly over the storage of s.
218 vst1_f32((
float *)&
s, prod);
// palign_impl specialization for Packet2cf. Only the struct header and one
// statement are visible in this extract.
224 struct palign_impl<Offset,Packet2cf>
// Replace first.v with [first lane 2, first lane 3, second lane 0, second lane 1],
// i.e. drop the first complex element of `first` and append the first of `second`.
230 first.v = vextq_f32(first.v, second.v, 2);
235 template<>
struct conj_helper<Packet2cf, Packet2cf, false,true>
246 template<>
struct conj_helper<Packet2cf, Packet2cf, true,false>
257 template<>
struct conj_helper<Packet2cf, Packet2cf, true,true>
// Product of a and b as computed by the <false,true> conj_helper
// specialization. Its body is not visible here.
// NOTE(review): presumably this is a * conj(b), the numerator of a complex
// division - confirm against the conj_helper definition.
273 Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(
a,
b);
// Lane-wise squares of b: [re0^2, im0^2, re1^2, im1^2].
277 s = vmulq_f32(
b.v,
b.v);
// Same squares with each adjacent pair swapped: [im0^2, re0^2, im1^2, re1^2].
// NOTE(review): presumably s and rev_s are then added to get |b|^2 in every
// lane; the remaining statements are not visible in this extract.
278 rev_s = vrev64q_f32(
s);
// Transposes, in place, the 2x2 block of complex values held in two Packet2cf
// rows: afterwards packet[0] holds the first element of each input row and
// packet[1] holds the second element of each input row.
// The closing brace is not visible in this extract.
283 EIGEN_DEVICE_FUNC
inline void
284 ptranspose(PacketBlock<Packet2cf,2>& kernel) {
// Build the new second row first, because it reads packet[0]'s high half,
// which the next statement overwrites.
285 Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
// New first row: low halves (first complex element) of both input rows.
286 kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
287 kernel.packet[1].v = tmp;
291 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
// 64-bit sign-flip mask: lane 0 is zero, lane 1 has only bit 63 set.
// XOR-ing a two-double vector's bits with it negates lane 1.
// NOTE(review): two definitions of the same name follow; presumably they sit
// in opposite branches of a preprocessor conditional not visible here.
// NOTE(review): the vector is static but not const, so every translation unit
// including this header gets its own mutable copy - consider making it const.
// Form 1: direct vector initializer.
295 static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
// Form 2: same values kept in an array and loaded with vld1q_u64.
297 const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
298 static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
// packet_traits specialization for std::complex<double>, derived from
// default_packet_traits. Only the two typedefs below are visible here.
308 template<>
struct packet_traits<
std::
complex<double> > : default_packet_traits
// Full packet type for complex<double>.
310 typedef Packet1cd
type;
// The half packet is the same type as the full packet.
311 typedef Packet1cd
half;
// Returns a packet whose bits are a.v XOR p2ul_CONJ_XOR. Since the mask only
// has the top bit of lane 1 set, this negates lane 1 (the imaginary slot in
// this file's [re, im] layout) and leaves lane 0 unchanged.
342 template<>
EIGEN_STRONG_INLINE Packet1cd
pconj(
const Packet1cd&
a) {
// View the doubles as 64-bit integers, flip the masked bit, view as doubles again.
return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(
a.v), p2ul_CONJ_XOR))); }
// Complex multiply of one complex<double> per packet, lanes laid out [re, im].
// The function header and the declarations of v1/v2 are not visible here.
// v1 = [a.re, a.re].
349 v1 = vdupq_lane_f64(vget_low_f64(
a.v), 0);
// v2 = [a.im, a.im].
351 v2 = vdupq_lane_f64(vget_high_f64(
a.v), 0);
// v1 = [a.re*b.re, a.re*b.im].
353 v1 = vmulq_f64(v1,
b.v);
// v2 = [a.im*b.re, a.im*b.im].
355 v2 = vmulq_f64(v2,
b.v);
// Flip the sign bit of lane 1: v2 = [a.im*b.re, -(a.im*b.im)].
357 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
// Reorder v2 with preverse<Packet2d>.
// NOTE(review): presumably this swaps the two lanes, giving
// [-(a.im*b.im), a.im*b.re] - preverse<Packet2d> is defined outside this extract.
359 v2 = preverse<Packet2d>(v2);
// With the lanes swapped, the sum is [a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re].
361 return Packet1cd(vaddq_f64(v1, v2));
// The four returns below reinterpret a.v and b.v as 64-bit unsigned lanes,
// apply one bitwise operation, and reinterpret the result back to doubles.
// The enclosing function headers are not visible in this extract.
// Bitwise AND: a & b.
366 return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v))));
// Bitwise OR: a | b.
370 return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v))));
// Bitwise XOR: a ^ b.
374 return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v))));
// Bit clear: a & ~b (vbicq clears in a every bit that is set in b).
378 return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(
a.v),vreinterpretq_u64_f64(
b.v))));
// Loads the single std::complex<double> at from[0] into a Packet1cd.
//   from   - pointer to the element to read
//   stride - only ever multiplied by 0 in the visible statements, so it does
//            not affect which element is read
// Returns a packet whose lanes are [re(from[0]), im(from[0])].
// The declaration of `res` and the function braces are not visible in this extract.
388 template<> EIGEN_DEVICE_FUNC
inline Packet1cd pgather<std::complex<double>, Packet1cd>(
const std::complex<double>* from,
Index stride)
// Lane 0: real part.
391 res = vsetq_lane_f64(
std::real(from[0*stride]), res, 0);
// Lane 1: imaginary part.
392 res = vsetq_lane_f64(
std::imag(from[0*stride]), res, 1);
393 return Packet1cd(res);
// Writes the single complex value held in `from` to to[0].
//   to     - destination element
//   from   - packet read as lane 0 = real part, lane 1 = imaginary part
//   stride - only ever multiplied by 0 in the visible statement, so it does
//            not affect which element is written
// The function braces are not visible in this extract.
396 template<> EIGEN_DEVICE_FUNC
inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
const Packet1cd& from,
Index stride)
398 to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
// Store packet a into the local `res` through pstore. The declaration of
// `res` and the enclosing function are not visible in this extract.
405 pstore<std::complex<double> >(&res,
a);
419 struct palign_impl<Offset,Packet1cd>
428 template<>
struct conj_helper<Packet1cd, Packet1cd, false,true>
439 template<>
struct conj_helper<Packet1cd, Packet1cd, true,false>
450 template<>
struct conj_helper<Packet1cd, Packet1cd, true,true>
// Product of a and b as computed by the <false,true> conj_helper
// specialization. Its body is not visible here.
// NOTE(review): presumably a * conj(b), used as the numerator of a complex
// division - confirm against the conj_helper definition.
466 Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(
a,
b);
// Regroups the lanes of two packets in place: packet[0] becomes the low
// (lane 0) values of both inputs, packet[1] the high (lane 1) values.
// The function header is not visible in this extract.
// Build the new packet[1] first, because it reads packet[0]'s high lane,
// which the next statement overwrites.
480 Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
481 kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
482 kernel.packet[1].v = tmp;
484 #endif // EIGEN_ARCH_ARM64
490 #endif // EIGEN_COMPLEX_NEON_H