#ifndef EIGEN_PACKET_MATH_SVE_H
#define EIGEN_PACKET_MATH_SVE_H

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
template <typename Scalar, int SVEVectorLength>
struct sve_packet_size_selector {
  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
};
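// e.g. with EIGEN_ARM64_SVE_VL == 512, sve_packet_size_selector<float, 512>::size
// evaluates to 512 / (sizeof(float) * CHAR_BIT) == 512 / 32 == 16 lanes per packet.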
typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
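// The arm_sve_vector_bits attribute turns the sizeless svint32_t into a
// fixed-length type, so PacketXi has a compile-time size and can be used as a
// regular value type (stored in structs, returned by value, etc.).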
template <>
EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
}

template <>
EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
  return svdup_n_s32(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
  numext::int32_t c[packet_traits<numext::int32_t>::size];
  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
  return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
}
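// e.g. on a 128-bit SVE vector (4 x int32), plset<PacketXi>(3) builds
// c = {0, 1, 2, 3} and returns {3, 4, 5, 6}.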
template <>
EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svadd_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svsub_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pnegate<PacketXi>(const PacketXi& a) {
  return svneg_s32_z(svptrue_b32(), a);
}
template <>
EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svmul_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svdiv_s32_z(svptrue_b32(), a, b);
}
template <>
EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
  return svmla_s32_z(svptrue_b32(), c, a, b);
}
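// Note the operand order: svmla_s32_z(pg, acc, op1, op2) computes acc + op1 * op2,
// so passing (c, a, b) yields a * b + c, matching Eigen's pmadd contract.
// e.g. pmadd(pset1<PacketXi>(2), pset1<PacketXi>(3), pset1<PacketXi>(4))
// fills every lane with 2 * 3 + 4 == 10.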
template <>
EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svmin_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svmax_s32_z(svptrue_b32(), a, b);
}
template <>
EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
}

template <>
EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
}

template <>
EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
}
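// SVE comparisons produce an svbool_t predicate rather than a data vector;
// svdup_n_s32_z broadcasts 0xffffffff in the active lanes and zeroes the rest,
// materializing Eigen's all-ones/all-zeros mask convention. e.g. comparing
// {1, 5, ...} < {3, 2, ...} lane-wise yields {0xffffffff, 0x00000000, ...}.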
template <>
EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
}

template <>
EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
  return svdup_n_s32_z(svptrue_b32(), 0);
}
template <>
EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svand_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svorr_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return sveor_s32_z(svptrue_b32(), a, b);
}

// BIC computes a & ~b, which is exactly Eigen's pandnot
template <>
EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
  return svbic_s32_z(svptrue_b32(), a, b);
}
template <int N>
EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
  return svasrd_n_s32_z(svptrue_b32(), a, N);
}

template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
}

template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
}
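// Arithmetic shifts replicate the sign bit while logical shifts insert zeros:
// for a lane holding -8 (0xfffffff8), parithmetic_shift_right<2> yields -2,
// whereas plogical_shift_right<2> yields 0x3ffffffe.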
template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
  svuint32_t indices = svindex_u32(0, 1);  // {0, 1, 2, 3, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 1, 1, 2, 2, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}

template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
  svuint32_t indices = svindex_u32(0, 1);  // {0, 1, 2, 3, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 1, 1, 2, 2, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 0, 0, 1, 1, 1, 1, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}
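// The zipped indices make the gather replicate consecutive scalars: on a
// 128-bit vector, ploaddup reads from[{0, 0, 1, 1}] and ploadquad reads
// from[{0, 0, 0, 0}], i.e. each source element is duplicated two or four times.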
template <>
EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
  // Index format: {0, stride, 2 * stride, 3 * stride, ...}
  svint32_t indices = svindex_s32(0, stride);
  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride) {
  // Index format: {0, stride, 2 * stride, 3 * stride, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
}
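// Usage sketch (hypothetical data pointer): pgather<numext::int32_t, PacketXi>(data, 3)
// loads data[0], data[3], data[6], ... into consecutive lanes; pscatter with the
// same stride performs the inverse store.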
template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
  // svlasta with an all-false predicate returns the first element of a
  return svlasta_s32(svpfalse_b(), a);
}

template <>
EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
  return svabs_s32_z(svptrue_b32(), a);
}
template <>
EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);

  // Multiply the vector by its reverse
  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
  svint32_t half_prod;

  // Fold in the upper half of the vector; wider vector lengths need extra steps
  if (EIGEN_ARM64_SVE_VL >= 2048) {
    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 1024) {
    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 512) {
    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 256) {
    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  // Last reduction step
  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);

  // The reduction lands in the first element
  return pfirst<PacketXi>(prod);
}
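// How the reduction works: svindex_u32(k, 1) builds {k, k+1, k+2, ...}, so the
// svtbl shifts the vector down by k lanes. Multiplying the vector with its own
// reverse first, then repeatedly folding in the shifted upper half, halves the
// number of live lanes per step until lane 0 holds the product of all lanes.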
template <>
EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
  return svminv_s32(svptrue_b32(), a);
}

template <>
EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
  return svmaxv_s32(svptrue_b32(), a);
}
template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
  numext::int32_t buffer[packet_traits<numext::int32_t>::size * N] = {0};
  int i = 0;

  PacketXi stride_index = svindex_s32(0, N);

  // Scatter each packet into the buffer with stride N (transposed layout)
  for (i = 0; i < N; i++) {
    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
  }
  // Reload the rows of the transposed block contiguously
  for (i = 0; i < N; i++) {
    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
  }
}
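// Sketch of the data movement for a PacketBlock<PacketXi, 4>: the scatter loop
// stores element j of packet i at buffer[j * 4 + i] (column-major), so the
// contiguous reload of row i picks up the old column i, i.e. the transpose.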
typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
template <>
struct packet_traits<float> : default_packet_traits {
template <>
EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
  return svdup_n_f32(from);
}

template <>
EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
}
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
  float c[packet_traits<float>::size];
  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
}
template <>
EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svadd_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svsub_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pnegate<PacketXf>(const PacketXf& a) {
  return svneg_f32_z(svptrue_b32(), a);
}
template <>
EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svmul_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svdiv_f32_z(svptrue_b32(), a, b);
}
template <>
EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
  return svmla_f32_z(svptrue_b32(), c, a, b);
}
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svmin_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svminnm_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svmax_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svmaxnm_f32_z(svptrue_b32(), a, b);
}
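// svminnm/svmaxnm implement IEEE-754 minNum/maxNum: when exactly one operand is
// NaN they return the numeric operand, which is the PropagateNumbers policy.
// e.g. pmax<PropagateNumbers>(pset1<PacketXf>(NAN), pset1<PacketXf>(2.f)) gives
// 2.f in every lane, while the plain svmax above would propagate the NaN.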
template <>
EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
}

template <>
EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
}
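// pcmp_lt_or_nan must signal a < b *or* an unordered pair. svcmpge is false
// whenever either operand is NaN, so inverting it with svnot_b_z yields
// exactly !(a >= b), i.e. "less than, or at least one NaN".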
template <>
EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
  // svrintm rounds toward minus infinity, i.e. floor
  return svrintm_f32_z(svptrue_b32(), a);
}

template <>
EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
}
// There are no float bitwise intrinsics, so operate on the bit patterns via reinterpret casts
template <>
EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

template <>
EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

template <>
EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

template <>
EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
template <>
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
  svuint32_t indices = svindex_u32(0, 1);  // {0, 1, 2, 3, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 1, 1, 2, 2, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}

template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
  svuint32_t indices = svindex_u32(0, 1);  // {0, 1, 2, 3, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 1, 1, 2, 2, ...}
  indices = svzip1_u32(indices, indices);  // {0, 0, 0, 0, 1, 1, 1, 1, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}
template <>
EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
  // Index format: {0, stride, 2 * stride, 3 * stride, ...}
  svint32_t indices = svindex_s32(0, stride);
  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
  // Index format: {0, stride, 2 * stride, 3 * stride, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
}
template <>
EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
  // svlasta with an all-false predicate returns the first element of a
  return svlasta_f32(svpfalse_b(), a);
}

template <>
EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
  return svabs_f32_z(svptrue_b32(), a);
}

template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  return svaddv_f32(svptrue_b32(), a);
}
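// svaddv_f32 is a horizontal add over all active lanes, e.g. reducing
// {1.f, 2.f, 3.f, 4.f} on a 128-bit vector returns 10.f.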
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);

  // Multiply the vector by its reverse
  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
  svfloat32_t half_prod;

  // Fold in the upper half of the vector; wider vector lengths need extra steps
  if (EIGEN_ARM64_SVE_VL >= 2048) {
    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 1024) {
    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 512) {
    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 256) {
    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  // Last reduction step
  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);

  // The reduction lands in the first element
  return pfirst<PacketXf>(prod);
}
template <>
EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
  return svminv_f32(svptrue_b32(), a);
}

template <>
EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
  return svmaxv_f32(svptrue_b32(), a);
}
template <int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
  float buffer[packet_traits<float>::size * N] = {0};
  int i = 0;

  PacketXi stride_index = svindex_s32(0, N);

  // Scatter each packet into the buffer with stride N (transposed layout)
  for (i = 0; i < N; i++) {
    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
  }
  // Reload the rows of the transposed block contiguously
  for (i = 0; i < N; i++) {
    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
  }
}
#endif  // EIGEN_PACKET_MATH_SVE_H