#ifndef EIGEN_PACKET_MATH_AVX512_H
#define EIGEN_PACKET_MATH_AVX512_H

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
template<> struct packet_traits<float> : default_packet_traits
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
template<> struct packet_traits<double> : default_packet_traits
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
return _mm512_set1_ps(from);
return _mm512_set1_pd(from);
return _mm512_set1_epi32(from);
return _mm512_castsi512_ps(_mm512_set1_epi32(from));
return _mm512_castsi512_pd(_mm512_set1_epi64(from));
return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
                                            0, -1, 0, -1, 0, -1, 0, -1));
return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
                        0, -1, 0, -1, 0, -1, 0, -1);
return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
                                            0, 0, -1, -1, 0, 0, -1, -1));
return _mm512_broadcastss_ps(_mm_load_ps1(from));
return _mm512_set1_pd(*from);
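// plset<Packet>(a) produces {a, a+1, ..., a+N-1}: a broadcast of the scalar
// plus a constant index ramp.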
return _mm512_add_ps(
    _mm512_set1_ps(a),
    _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
                  4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
return _mm512_add_pd(_mm512_set1_pd(a),
                     _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
return _mm512_add_ps(a, b);
return _mm512_add_pd(a, b);
return _mm512_add_epi32(a, b);
return _mm512_sub_ps(a, b);
return _mm512_sub_pd(a, b);
return _mm512_sub_epi32(a, b);
return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
return _mm512_mul_ps(a, b);
return _mm512_mul_pd(a, b);
return _mm512_mullo_epi32(a, b);
return _mm512_div_ps(a, b);
return _mm512_div_pd(a, b);
#ifdef EIGEN_VECTORIZE_FMA
return _mm512_fmadd_ps(a, b, c);
return _mm512_fmadd_pd(a, b, c);
#endif
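// pselect: AVX-512 has no bitwise blend on full vectors, so the all-ones /
// all-zeros mask vector is compared against zero to obtain a k-mask, which
// then drives a single masked blend.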
__mmask16 mask16 = _mm512_cmp_epi32_mask(
    _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
return _mm512_mask_blend_ps(mask16, a, b);
__mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
                                       _mm512_setzero_epi32(), _MM_CMPINT_EQ);
return _mm512_mask_blend_pd(mask8, a, b);
return _mm512_min_ps(b, a);
return _mm512_min_pd(b, a);
return _mm512_max_ps(b, a);
return _mm512_max_pd(b, a);
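// The operands above are deliberately swapped (b, a): x86 min/max return the
// second operand when the comparison is unordered, so this matches std::min
// and std::max, which return their first argument when a NaN is involved.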
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
                                              _mm256_castps_si256(b), 1));
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
                                    _mm256_extractf128_si256(lo, 1));
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
                                    _mm256_extractf128_si256(hi, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
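// Comparisons: AVX-512 compares yield a k-mask rather than a vector, so the
// mask is expanded back into a full packet whose selected lanes have all bits
// set, matching the packet "boolean" convention consumed by pselect/pand.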
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
return _mm512_castsi512_ps(
    _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
return _mm512_castsi512_ps(
    _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
return _mm512_castsi512_ps(
    _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
return _mm512_castsi512_ps(
    _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
return _mm512_castsi512_pd(
    _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
return _mm512_castsi512_pd(
    _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
return _mm512_castsi512_pd(
    _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
return _mm512_castsi512_pd(
    _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
return _mm512_set1_epi32(0xffffffffu);
return _mm512_and_si512(a, b);
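// AVX-512F only provides bitwise logic on integer vectors; the float/double
// forms (_mm512_and_ps, _mm512_or_pd, ...) require AVX512DQ, hence the
// fallback paths below that bounce through __m512i casts or 256-bit lanes.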
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_ps(a, b);
#else
return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_pd(a, b);
#else
Packet8d res = _mm512_undefined_pd();
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
#endif
return _mm512_or_si512(a, b);
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
return _mm512_xor_si512(a, b);
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
return _mm512_andnot_si512(b, a);
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_ps(b, a);
#else
return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_pd(b, a);
#else
return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
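// pround: or the sign bit of `a` into prev0dot5 (0x3EFFFFFF, the largest
// float below 0.5) to get +/-0.49999997, add it to `a`, then truncate toward
// zero. Half-way cases thus round away from zero, matching std::round rather
// than the default round-to-even mode.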
return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
return _mm512_srai_epi32(a, N);
return _mm512_srli_epi32(a, N);
return _mm512_slli_epi32(a, N);
reinterpret_cast<const __m512i*>(from));
reinterpret_cast<const __m512i*>(from));
__mmask16 mask = static_cast<__mmask16>(umask);
__m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
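// ploaddup loads N/2 values and duplicates each into adjacent lanes:
// {a, b, ...} -> {a, a, b, b, ...}. The float version widens each 32-bit
// element to a 64-bit lane and re-pairs it with an in-lane permute; the
// double version uses 128-bit inserts with AVX512DQ, or masked broadcasts
// with the pair masks 0x3 << 2k otherwise.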
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m512d x = _mm512_setzero_pd();
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
#else
__m512d x = _mm512_setzero_pd();
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
#endif
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
return _mm512_permutexvar_ps(scatter_mask, tmp);
__m256d lane0 = _mm256_set1_pd(*from);
__m256d lane1 = _mm256_set1_pd(*(from+1));
__m512d tmp = _mm512_undefined_pd();
tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1);
reinterpret_cast<__m512i*>(to), from);
__mmask16 mask = static_cast<__mmask16>(umask);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
    _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
return _mm512_i32gather_ps(indices, from, 4);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
return _mm512_i32gather_pd(indices, from, 8);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
    _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_ps(to, indices, from, 4);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
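// pgather/pscatter build the index vector stride * {0, 1, ..., N-1} and issue
// one hardware gather/scatter (scale = sizeof(element)). Usage sketch with a
// hypothetical buffer and stride, via the generic packet API:
//   const double* buf = ...; Eigen::Index stride = 3;
//   Packet8d col = pgather<double, Packet8d>(buf, stride); // buf[0], buf[3], ...
//   pscatter<double, Packet8d>(out, col, stride);          // inverse operation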
return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
return _mm512_permutexvar_ps(
    _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
return _mm512_permutexvar_pd(
    _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a),
                                            _mm512_set1_epi32(0x7fffffff)));
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
                                            _mm512_set1_epi64(0x7fffffffffffffff)));
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
#else
return _mm512_cvtepi32_pd(
    _mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
#endif
const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
Packet8i lo = _mm256_slli_epi64(hi, 52);
hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
lo = _mm256_slli_epi64(hi, 52);
hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
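// pldexp<Packet8d>: each factor c encodes 2^b per lane, built by adding the
// bias to the 32-bit exponent chunks and shifting the result into the
// double's exponent field (bit 52 upward); the permute interleaves the eight
// exponents into the layout the two 256-bit halves of 64-bit lanes expect.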
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)           \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);    \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
      _mm512_extractf32x4_ps(INPUT, 3), 1);
#endif

#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
  OUTPUT = _mm512_undefined_ps();                                           \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f x = _mm256_add_ps(lane0, lane1);
return predux<Packet8f>(x);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
return _mm_cvtss_f32(sum);
#endif
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d sum = _mm256_add_pd(lane0, lane1);
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
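// Reductions split the 512-bit register into 256-bit (or 128-bit) halves,
// combine the halves pairwise, and finish with horizontal adds or shuffles on
// the narrowest vector, which is cheaper than lane-crossing 512-bit shuffles.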
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
return _mm256_add_ps(lane0, lane1);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum0 = _mm_add_ps(lane0, lane2);
__m128 sum1 = _mm_add_ps(lane1, lane3);
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
return _mm256_add_pd(lane0, lane1);
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = pmul(lane0, lane1);
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
__mmask16 tmp = _mm512_test_epi32_mask(xi, xi);
return !_mm512_kortestz(tmp, tmp);
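// predux_any: testing each 32-bit lane against itself sets the mask bit for
// every nonzero lane; kortest then reports whether the k-mask is empty.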
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
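// The 16x16 float transpose below works in three stages: unpacklo/hi on
// adjacent rows (2x2 blocks), shuffles across row pairs (4x4 blocks), then
// 128-bit lane permutes to place each quadrant, staged through 256-bit
// halves via EIGEN_EXTRACT_8f_FROM_16f / EIGEN_INSERT_8f_INTO_16f.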
__m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
__m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
__m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
__m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
__m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
__m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
__m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
__m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
__m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
__m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
__m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
__m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
__m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
__m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);

tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);

tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);

tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)        \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
                           INPUT[2 * INDEX + STRIDE]);
__m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

__m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
__m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
__m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);

tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);

#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
  OUTPUT[INDEX] =                                                           \
      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
__m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
__m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
__m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);

tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                       _mm512_extractf64x4_pd(T2, 0), 0x20);
tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                       _mm512_extractf64x4_pd(T3, 0), 0x20);
tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                       _mm512_extractf64x4_pd(T2, 0), 0x31);
tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                       _mm512_extractf64x4_pd(T3, 0), 0x31);

tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                       _mm512_extractf64x4_pd(T2, 1), 0x20);
tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                       _mm512_extractf64x4_pd(T3, 1), 0x20);
tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                       _mm512_extractf64x4_pd(T2, 1), 0x31);
tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                       _mm512_extractf64x4_pd(T3, 1), 0x31);
__m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
__m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
__m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
__m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
__m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
__m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
__m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
__m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);

tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                       _mm512_extractf64x4_pd(T2, 0), 0x20);
tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                       _mm512_extractf64x4_pd(T3, 0), 0x20);
tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                       _mm512_extractf64x4_pd(T2, 0), 0x31);
tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                       _mm512_extractf64x4_pd(T3, 0), 0x31);

tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                       _mm512_extractf64x4_pd(T2, 1), 0x20);
tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                       _mm512_extractf64x4_pd(T3, 1), 0x20);
tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                       _mm512_extractf64x4_pd(T2, 1), 0x31);
tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                       _mm512_extractf64x4_pd(T3, 1), 0x31);

tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                       _mm512_extractf64x4_pd(T6, 0), 0x20);
tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                       _mm512_extractf64x4_pd(T7, 0), 0x20);
tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                        _mm512_extractf64x4_pd(T6, 0), 0x31);
tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                        _mm512_extractf64x4_pd(T7, 0), 0x31);

tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                        _mm512_extractf64x4_pd(T6, 1), 0x20);
tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                        _mm512_extractf64x4_pd(T7, 1), 0x20);
tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                        _mm512_extractf64x4_pd(T6, 1), 0x31);
tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                        _mm512_extractf64x4_pd(T7, 1), 0x31);
assert(false && "To be implemented");
__mmask8 m = (ifPacket.select[0]   )
           | (ifPacket.select[1]<<1)
           | (ifPacket.select[2]<<2)
           | (ifPacket.select[3]<<3)
           | (ifPacket.select[4]<<4)
           | (ifPacket.select[5]<<5)
           | (ifPacket.select[6]<<6)
           | (ifPacket.select[7]<<7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
return _mm256_set1_epi16(from.x);
return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
_mm256_store_si256((__m256i*)(void*)to, from);
_mm256_storeu_si256((__m256i*)(void*)to, from);
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
unsigned short e = from[4].x;
unsigned short f = from[5].x;
unsigned short g = from[6].x;
unsigned short h = from[7].x;
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
#ifdef EIGEN_HAS_FP16_C
return _mm512_cvtph_ps(a);
return _mm512_set_ps(ff, fe, fd, fc, fb, fa, f9, f8,
                     f7, f6, f5, f4, f3, f2, f1, f0);
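// half <-> float conversion: with hardware half-precision conversion support
// a single vcvtph2ps/vcvtps2ph covers all 16 lanes; the fallback converts
// lane by lane through the scalar half routines.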
#ifdef EIGEN_HAS_FP16_C
return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
return _mm256_set_epi16(hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
                        h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm256_andnot_si256(sign_mask, a);
return _mm256_blendv_epi8(b, a, mask);
Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
return _mm256_xor_si256(a, sign_mask);
Packet8h lane0 = _mm256_extractf128_si256(a, 0);
Packet8h lane1 = _mm256_extractf128_si256(a, 1);
__m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
return _mm256_insertf128_si256(
    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
    _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
return _mm256_set_epi16(
    from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
    from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
    from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
    from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
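// pscatter for half/bfloat16: the packet is first spilled to an aligned
// temporary (aux), then each lane is written out at the requested stride.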
to[stride*0] = aux[0];
to[stride*1] = aux[1];
to[stride*2] = aux[2];
to[stride*3] = aux[3];
to[stride*4] = aux[4];
to[stride*5] = aux[5];
to[stride*6] = aux[6];
to[stride*7] = aux[7];
to[stride*8] = aux[8];
to[stride*9] = aux[9];
to[stride*10] = aux[10];
to[stride*11] = aux[11];
to[stride*12] = aux[12];
to[stride*13] = aux[13];
to[stride*14] = aux[14];
to[stride*15] = aux[15];
__m256i k = kernel.packet[10];
__m256i l = kernel.packet[11];
__m256i m = kernel.packet[12];
__m256i n = kernel.packet[13];
__m256i o = kernel.packet[14];
__m256i p = kernel.packet[15];

__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
__m256i ef_07 = _mm256_unpacklo_epi16(e, f);
__m256i gh_07 = _mm256_unpacklo_epi16(g, h);
__m256i ij_07 = _mm256_unpacklo_epi16(i, j);
__m256i kl_07 = _mm256_unpacklo_epi16(k, l);
__m256i mn_07 = _mm256_unpacklo_epi16(m, n);
__m256i op_07 = _mm256_unpacklo_epi16(o, p);

__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
__m256i cd_8f = _mm256_unpackhi_epi16(c, d);
__m256i ef_8f = _mm256_unpackhi_epi16(e, f);
__m256i gh_8f = _mm256_unpackhi_epi16(g, h);
__m256i ij_8f = _mm256_unpackhi_epi16(i, j);
__m256i kl_8f = _mm256_unpackhi_epi16(k, l);
__m256i mn_8f = _mm256_unpackhi_epi16(m, n);
__m256i op_8f = _mm256_unpackhi_epi16(o, p);
__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
__m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
__m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
__m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
__m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
__m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
__m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);

__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
__m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
__m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
__m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
__m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
__m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
__m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);

__m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
__m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
__m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
__m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
__m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
__m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
__m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
__m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
__m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
__m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
__m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
__m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
__m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
__m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
__m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
__m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);

__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0] = a_p_0;
kernel.packet[1] = a_p_1;
kernel.packet[2] = a_p_2;
kernel.packet[3] = a_p_3;
kernel.packet[4] = a_p_4;
kernel.packet[5] = a_p_5;
kernel.packet[6] = a_p_6;
kernel.packet[7] = a_p_7;
kernel.packet[8] = a_p_8;
kernel.packet[9] = a_p_9;
kernel.packet[10] = a_p_a;
kernel.packet[11] = a_p_b;
kernel.packet[12] = a_p_c;
kernel.packet[13] = a_p_d;
kernel.packet[14] = a_p_e;
kernel.packet[15] = a_p_f;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
for (int j = 0; j < 8; ++j) {
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
for (int j = 0; j < 4; ++j) {
for (int j = 0; j < 4; ++j) {
for (int j = 0; j < 4; ++j) {
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm256_set1_epi16(from.value);
t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
_mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
unsigned short a = from[0].value;
unsigned short b = from[1].value;
unsigned short c = from[2].value;
unsigned short d = from[3].value;
unsigned short e = from[4].value;
unsigned short f = from[5].value;
unsigned short g = from[6].value;
unsigned short h = from[7].value;
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
unsigned short a = from[0].value;
unsigned short b = from[1].value;
unsigned short c = from[2].value;
unsigned short d = from[3].value;
return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
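// Bf16ToF32 widens by shifting each 16-bit value into the upper half of a
// 32-bit lane (bfloat16 is the high 16 bits of a binary32). F32ToBf16 below
// rounds to nearest-even by adding 0x7fff plus the lsb of the truncated
// result before shifting right by 16, and forces a quiet-NaN payload
// (0x7fc0) for unordered lanes.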
#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1)
r = (__m256i)(_mm512_cvtneps_pbh(a));
#else
__m512i input = _mm512_castps_si512(a);
__m512i nan = _mm512_set1_epi32(0x7fc0);
__m512i t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
t = _mm512_add_epi32(t, input);
t = _mm512_srli_epi32(t, 16);
__mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
t = _mm512_mask_blend_epi32(mask, nan, t);
r = _mm512_cvtepi32_epi16(t);
#endif // EIGEN_VECTORIZE_AVX512BF16
return _mm256_blendv_epi8(b, a, mask);
Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
return _mm256_xor_si256(a, sign_mask);
const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm256_andnot_si256(sign_mask, a);
Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
__m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
                             14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
res = _mm256_permute2x128_si256(a, a, 1);
return _mm256_shuffle_epi8(res, m);
return _mm256_set_epi16(
    from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
    from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
    from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
    from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
to[stride*0] = aux[0];
to[stride*1] = aux[1];
to[stride*2] = aux[2];
to[stride*3] = aux[3];
to[stride*4] = aux[4];
to[stride*5] = aux[5];
to[stride*6] = aux[6];
to[stride*7] = aux[7];
to[stride*8] = aux[8];
to[stride*9] = aux[9];
to[stride*10] = aux[10];
to[stride*11] = aux[11];
to[stride*12] = aux[12];
to[stride*13] = aux[13];
to[stride*14] = aux[14];
to[stride*15] = aux[15];
__m256i k = kernel.packet[10];
__m256i l = kernel.packet[11];
__m256i m = kernel.packet[12];
__m256i n = kernel.packet[13];
__m256i o = kernel.packet[14];
__m256i p = kernel.packet[15];

__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
__m256i ef_07 = _mm256_unpacklo_epi16(e, f);
__m256i gh_07 = _mm256_unpacklo_epi16(g, h);
__m256i ij_07 = _mm256_unpacklo_epi16(i, j);
__m256i kl_07 = _mm256_unpacklo_epi16(k, l);
__m256i mn_07 = _mm256_unpacklo_epi16(m, n);
__m256i op_07 = _mm256_unpacklo_epi16(o, p);

__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
__m256i cd_8f = _mm256_unpackhi_epi16(c, d);
__m256i ef_8f = _mm256_unpackhi_epi16(e, f);
__m256i gh_8f = _mm256_unpackhi_epi16(g, h);
__m256i ij_8f = _mm256_unpackhi_epi16(i, j);
__m256i kl_8f = _mm256_unpackhi_epi16(k, l);
__m256i mn_8f = _mm256_unpackhi_epi16(m, n);
__m256i op_8f = _mm256_unpackhi_epi16(o, p);
__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
__m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
__m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
__m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
__m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
__m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
__m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);

__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
__m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
__m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
__m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
__m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
__m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
__m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);

__m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
__m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
__m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
__m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
__m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
__m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
__m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
__m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
__m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
__m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
__m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
__m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
__m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
__m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
__m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
__m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
__m256i cd_8f = _mm256_unpackhi_epi16(c, d);

__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
#endif // EIGEN_PACKET_MATH_AVX512_H