// NOTE(review): extraction-mangled fragment of Eigen's AVX512 PacketMath
// header; the integers fused into each line are the original source line
// numbers.  All non-comment tokens below are left byte-identical.
// Header guard, cache-blocking threshold (8), register count (32), and the
// FMA single-instruction-madd flag, followed by trait enum fragments whose
// enclosing struct declarations were lost in extraction.
10 #ifndef EIGEN_PACKET_MATH_AVX512_H 11 #define EIGEN_PACKET_MATH_AVX512_H 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 21 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 22 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 25 #ifdef EIGEN_VECTORIZE_FMA 26 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 27 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 112 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) 141 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) 174 enum {
size = 16, alignment=
Aligned64, vectorizable=
true, masked_load_available=
true, masked_store_available=
true };
// size==8, 64-byte aligned, no masked ops -- presumably the Packet8d
// (double) traits; TODO confirm against the full header.
180 enum {
size = 8, alignment=
Aligned64, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
// size==16 but vectorizable==false -- presumably the Packet16i (int)
// traits in this Eigen version; TODO confirm.
186 enum {
size = 16, alignment=
Aligned64, vectorizable=
false, masked_load_available=
false, masked_store_available=
false };
// size==16 with only 32-byte alignment -- presumably the 256-bit
// Packet16h (half) traits; TODO confirm.
193 enum {
size=16, alignment=
Aligned32, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
198 return _mm512_set1_ps(from);
202 return _mm512_set1_pd(from);
206 return _mm512_set1_epi32(from);
211 return _mm512_castsi512_ps(_mm512_set1_epi32(from));
216 return _mm512_castsi512_pd(_mm512_set1_epi64(from));
224 return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
225 0, -1, 0, -1, 0, -1, 0, -1));
228 return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
229 0, -1, 0, -1, 0, -1, 0, -1);
232 return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
233 0, 0, -1, -1, 0, 0, -1, -1));
238 return _mm512_broadcastss_ps(_mm_load_ps1(from));
242 return _mm512_set1_pd(*from);
247 return _mm512_add_ps(
249 _mm512_set_ps(15.0
f, 14.0
f, 13.0
f, 12.0
f, 11.0
f, 10.0
f, 9.0
f, 8.0
f, 7.0
f, 6.0
f, 5.0
f,
250 4.0
f, 3.0
f, 2.0
f, 1.0
f, 0.0
f));
254 return _mm512_add_pd(_mm512_set1_pd(
a),
255 _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
260 const Packet16f&
b) {
261 return _mm512_add_ps(a,
b);
266 return _mm512_add_pd(a,
b);
270 const Packet16i&
b) {
271 return _mm512_add_epi32(a,
b);
276 const Packet16f&
b) {
277 return _mm512_sub_ps(a,
b);
282 return _mm512_sub_pd(a,
b);
286 const Packet16i&
b) {
287 return _mm512_sub_epi32(a,
b);
// Negation implemented as 0 - a.
// NOTE(review): this maps -0.0 to +0.0 (0 - (-0) == +0 under round-to-nearest);
// newer Eigen versions flip the sign bit with pxor instead -- confirm before
// relying on signed-zero behaviour here.
292 return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
296 return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
314 const Packet16f&
b) {
315 return _mm512_mul_ps(a,
b);
320 return _mm512_mul_pd(a,
b);
324 const Packet16i&
b) {
325 return _mm512_mullo_epi32(a,
b);
330 const Packet16f&
b) {
331 return _mm512_div_ps(a,
b);
336 return _mm512_div_pd(a,
b);
339 #ifdef EIGEN_VECTORIZE_FMA 342 const Packet16f&
c) {
343 return _mm512_fmadd_ps(a, b, c);
348 return _mm512_fmadd_pd(a, b, c);
355 const Packet16f& b) {
356 __mmask16 mask16 = _mm512_cmp_epi32_mask(
357 _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
358 return _mm512_mask_blend_ps(mask16, a, b);
365 __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
366 _mm512_setzero_epi32(), _MM_CMPINT_EQ);
367 return _mm512_mask_blend_pd(mask8, a, b);
372 const Packet16f&
b) {
374 return _mm512_min_ps(b, a);
380 return _mm512_min_pd(b, a);
385 const Packet16f&
b) {
387 return _mm512_max_ps(b, a);
393 return _mm512_max_pd(b, a);
431 #ifdef EIGEN_VECTORIZE_AVX512DQ 438 return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
443 return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
447 return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
448 _mm256_castps_si256(b),1));
462 __m256i lo = _mm256_castps_si256(extract256<0>(rf));
463 __m256i hi = _mm256_castps_si256(extract256<1>(rf));
464 __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
465 _mm256_extractf128_si256(lo, 1));
466 __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
467 _mm256_extractf128_si256(hi, 1));
468 return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
473 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
474 return _mm512_castsi512_ps(
475 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
478 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
479 return _mm512_castsi512_ps(
480 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
484 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
485 return _mm512_castsi512_ps(
486 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
490 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
491 return _mm512_castsi512_ps(
492 _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
496 __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
497 return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
503 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
504 return _mm512_castsi512_pd(
505 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
509 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
510 return _mm512_castsi512_pd(
511 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
515 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
516 return _mm512_castsi512_pd(
517 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
521 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
522 return _mm512_castsi512_pd(
523 _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
537 return _mm512_set1_epi32(0xffffffffu);
552 const Packet16i&
b) {
553 return _mm512_and_si512(a,b);
558 const Packet16f&
b) {
559 #ifdef EIGEN_VECTORIZE_AVX512DQ 560 return _mm512_and_ps(a, b);
562 return _mm512_castsi512_ps(
pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
568 #ifdef EIGEN_VECTORIZE_AVX512DQ 569 return _mm512_and_pd(a, b);
571 Packet8d
res = _mm512_undefined_pd();
572 Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
573 Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
574 res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
576 Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
577 Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
578 return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
584 return _mm512_or_si512(a, b);
589 #ifdef EIGEN_VECTORIZE_AVX512DQ 590 return _mm512_or_ps(a, b);
592 return _mm512_castsi512_ps(
por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
599 #ifdef EIGEN_VECTORIZE_AVX512DQ 600 return _mm512_or_pd(a, b);
602 return _mm512_castsi512_pd(
por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
608 return _mm512_xor_si512(a, b);
613 #ifdef EIGEN_VECTORIZE_AVX512DQ 614 return _mm512_xor_ps(a, b);
616 return _mm512_castsi512_ps(
pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
622 #ifdef EIGEN_VECTORIZE_AVX512DQ 623 return _mm512_xor_pd(a, b);
625 return _mm512_castsi512_pd(
pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
631 return _mm512_andnot_si512(b, a);
636 #ifdef EIGEN_VECTORIZE_AVX512DQ 637 return _mm512_andnot_ps(b, a);
639 return _mm512_castsi512_ps(
pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
644 #ifdef EIGEN_VECTORIZE_AVX512DQ 645 return _mm512_andnot_pd(b, a);
647 return _mm512_castsi512_pd(
pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
656 return _mm512_roundscale_ps(
padd(
por(
pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
663 return _mm512_roundscale_pd(
padd(
por(
pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
667 return _mm512_srai_epi32(a,
N);
671 return _mm512_srli_epi32(a,
N);
675 return _mm512_slli_epi32(a,
N);
689 reinterpret_cast<const __m512i*>(from));
703 reinterpret_cast<const __m512i*>(from));
708 __mmask16 mask =
static_cast<__mmask16
>(umask);
718 __m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
719 __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
720 __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
724 #ifdef EIGEN_VECTORIZE_AVX512DQ 730 __m512d x = _mm512_setzero_pd();
731 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
732 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
733 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
734 x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
740 __m512d x = _mm512_setzero_pd();
741 x = _mm512_mask_broadcastsd_pd(x, 0
x3<<0, _mm_load_sd(from+0));
742 x = _mm512_mask_broadcastsd_pd(x, 0
x3<<2, _mm_load_sd(from+1));
743 x = _mm512_mask_broadcastsd_pd(x, 0
x3<<4, _mm_load_sd(from+2));
744 x = _mm512_mask_broadcastsd_pd(x, 0
x3<<6, _mm_load_sd(from+3));
754 const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
755 return _mm512_permutexvar_ps(scatter_mask, tmp);
762 __m256d lane0 = _mm256_set1_pd(*from);
763 __m256d lane1 = _mm256_set1_pd(*(from+1));
764 __m512d tmp = _mm512_undefined_pd();
765 tmp = _mm512_insertf64x4(tmp, lane0, 0);
766 return _mm512_insertf64x4(tmp, lane1, 1);
794 reinterpret_cast<__m512i*>(to), from);
798 __mmask16 mask =
static_cast<__mmask16
>(umask);
805 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
806 Packet16i stride_multiplier =
807 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
808 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
810 return _mm512_i32gather_ps(indices, from, 4);
815 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
816 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
817 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
819 return _mm512_i32gather_pd(indices, from, 8);
824 const Packet16f& from,
826 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
827 Packet16i stride_multiplier =
828 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
829 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
830 _mm512_i32scatter_ps(to, indices, from, 4);
834 const Packet8d& from,
836 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
837 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
838 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
839 _mm512_i32scatter_pd(to, indices, from, 8);
864 return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
868 return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
872 return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
877 return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
882 return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
888 return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
893 return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
894 _mm512_set1_epi64(0x7fffffffffffffff)));
907 #ifdef EIGEN_VECTORIZE_AVX512DQ 908 return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(
pand(a, cst_exp_mask)), 52));
910 return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(
pand(a, cst_exp_mask)), 52)));
930 Packet8i b = parithmetic_shift_right<2>(
e);
933 const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
934 Packet8i hi = _mm256_permutevar8x32_epi32(
padd(b, bias), permute_idx);
935 Packet8i lo = _mm256_slli_epi64(hi, 52);
936 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
937 Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
942 hi = _mm256_permutevar8x32_epi32(
padd(b, bias), permute_idx);
943 lo = _mm256_slli_epi64(hi, 52);
944 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
945 c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
950 #ifdef EIGEN_VECTORIZE_AVX512DQ 952 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ 953 __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ 954 __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) 956 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ 957 __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ 958 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \ 959 _mm512_extractf32x4_ps(INPUT, 1), 1); \ 960 __m256 OUTPUT##_1 = _mm256_insertf128_ps( \ 961 _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \ 962 _mm512_extractf32x4_ps(INPUT, 3), 1); 965 #ifdef EIGEN_VECTORIZE_AVX512DQ 966 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ 967 OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); 969 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ 970 OUTPUT = _mm512_undefined_ps(); \ 971 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ 972 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ 973 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ 974 OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); 979 #ifdef EIGEN_VECTORIZE_AVX512DQ 980 __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
981 __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
982 Packet8f x = _mm256_add_ps(lane0, lane1);
985 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
986 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
987 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
988 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
989 __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
990 sum = _mm_hadd_ps(sum, sum);
991 sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
992 return _mm_cvtss_f32(sum);
997 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
998 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
999 __m256d sum = _mm256_add_pd(lane0, lane1);
1000 __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
1001 return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
1006 #ifdef EIGEN_VECTORIZE_AVX512DQ 1007 __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
1008 __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
1009 return _mm256_add_ps(lane0, lane1);
1011 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1012 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1013 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1014 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1015 __m128 sum0 = _mm_add_ps(lane0, lane2);
1016 __m128 sum1 = _mm_add_ps(lane1, lane3);
1017 return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
1022 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1023 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1024 return _mm256_add_pd(lane0, lane1);
1031 Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
1032 Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
1034 res =
pmul(res, _mm256_permute2f128_ps(res, res, 1));
1035 res =
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1036 return pfirst(
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1038 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1039 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1040 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1041 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1042 __m128 res =
pmul(
pmul(lane0, lane1),
pmul(lane2, lane3));
1043 res =
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1044 return pfirst(
pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1049 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1050 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1051 __m256d
res =
pmul(lane0, lane1);
1052 res =
pmul(res, _mm256_permute2f128_pd(res, res, 1));
1053 return pfirst(
pmul(res, _mm256_shuffle_pd(res, res, 1)));
1058 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1059 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1060 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1061 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1062 __m128
res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
1063 res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1064 return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1068 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1069 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1070 __m256d
res = _mm256_min_pd(lane0, lane1);
1071 res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
1072 return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
1077 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1078 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1079 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1080 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1081 __m128
res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
1082 res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
1083 return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
1088 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1089 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1090 __m256d
res = _mm256_max_pd(lane0, lane1);
1091 res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
1092 return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
1097 Packet16i
xi = _mm512_castps_si512(x);
1098 __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
1099 return !_mm512_kortestz(tmp,tmp);
1104 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ 1105 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); 1108 __m512 T0 = _mm512_unpacklo_ps(kernel.
packet[0], kernel.
packet[1]);
1109 __m512
T1 = _mm512_unpackhi_ps(kernel.
packet[0], kernel.
packet[1]);
1110 __m512
T2 = _mm512_unpacklo_ps(kernel.
packet[2], kernel.
packet[3]);
1111 __m512
T3 = _mm512_unpackhi_ps(kernel.
packet[2], kernel.
packet[3]);
1112 __m512
T4 = _mm512_unpacklo_ps(kernel.
packet[4], kernel.
packet[5]);
1113 __m512
T5 = _mm512_unpackhi_ps(kernel.
packet[4], kernel.
packet[5]);
1114 __m512
T6 = _mm512_unpacklo_ps(kernel.
packet[6], kernel.
packet[7]);
1115 __m512 T7 = _mm512_unpackhi_ps(kernel.
packet[6], kernel.
packet[7]);
1116 __m512 T8 = _mm512_unpacklo_ps(kernel.
packet[8], kernel.
packet[9]);
1117 __m512 T9 = _mm512_unpackhi_ps(kernel.
packet[8], kernel.
packet[9]);
1118 __m512 T10 = _mm512_unpacklo_ps(kernel.
packet[10], kernel.
packet[11]);
1119 __m512 T11 = _mm512_unpackhi_ps(kernel.
packet[10], kernel.
packet[11]);
1120 __m512 T12 = _mm512_unpacklo_ps(kernel.
packet[12], kernel.
packet[13]);
1121 __m512 T13 = _mm512_unpackhi_ps(kernel.
packet[12], kernel.
packet[13]);
1122 __m512 T14 = _mm512_unpacklo_ps(kernel.
packet[14], kernel.
packet[15]);
1123 __m512 T15 = _mm512_unpackhi_ps(kernel.
packet[14], kernel.
packet[15]);
1124 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1125 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1126 __m512
S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1127 __m512
S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1128 __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1129 __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1130 __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1131 __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1132 __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1133 __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1134 __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1135 __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1136 __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1137 __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1138 __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1139 __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1160 tmp.
packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
1161 tmp.
packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
1162 tmp.
packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
1163 tmp.
packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
1164 tmp.
packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
1165 tmp.
packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
1166 tmp.
packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
1167 tmp.
packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
1169 tmp.
packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
1170 tmp.
packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
1171 tmp.
packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
1172 tmp.
packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
1173 tmp.
packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
1174 tmp.
packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
1175 tmp.
packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
1176 tmp.
packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
1179 tmp.
packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
1180 tmp.
packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
1181 tmp.
packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
1182 tmp.
packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
1183 tmp.
packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
1184 tmp.
packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
1185 tmp.
packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
1186 tmp.
packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
1188 tmp.
packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
1189 tmp.
packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
1190 tmp.
packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
1191 tmp.
packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
1192 tmp.
packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
1193 tmp.
packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
1194 tmp.
packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
1195 tmp.
packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
1218 #define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \ 1219 EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \ 1220 INPUT[2 * INDEX + STRIDE]); 1223 __m512 T0 = _mm512_unpacklo_ps(kernel.
packet[0], kernel.
packet[1]);
1224 __m512
T1 = _mm512_unpackhi_ps(kernel.
packet[0], kernel.
packet[1]);
1225 __m512
T2 = _mm512_unpacklo_ps(kernel.
packet[2], kernel.
packet[3]);
1226 __m512
T3 = _mm512_unpackhi_ps(kernel.
packet[2], kernel.
packet[3]);
1228 __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1229 __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1230 __m512
S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1231 __m512
S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1240 tmp.
packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
1241 tmp.
packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
1242 tmp.
packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
1243 tmp.
packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
1245 tmp.
packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
1246 tmp.
packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
1247 tmp.
packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
1248 tmp.
packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
1256 #define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \ 1257 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \ 1258 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1); 1260 #define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \ 1261 OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \ 1263 _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1); 1266 __m512d T0 = _mm512_shuffle_pd(kernel.
packet[0], kernel.
packet[1], 0);
1267 __m512d
T1 = _mm512_shuffle_pd(kernel.
packet[0], kernel.
packet[1], 0xff);
1268 __m512d
T2 = _mm512_shuffle_pd(kernel.
packet[2], kernel.
packet[3], 0);
1269 __m512d
T3 = _mm512_shuffle_pd(kernel.
packet[2], kernel.
packet[3], 0xff);
1273 tmp.
packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1274 _mm512_extractf64x4_pd(T2, 0), 0x20);
1275 tmp.
packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1276 _mm512_extractf64x4_pd(T3, 0), 0x20);
1277 tmp.
packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1278 _mm512_extractf64x4_pd(T2, 0), 0x31);
1279 tmp.
packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1280 _mm512_extractf64x4_pd(T3, 0), 0x31);
1282 tmp.
packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1283 _mm512_extractf64x4_pd(T2, 1), 0x20);
1284 tmp.
packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1285 _mm512_extractf64x4_pd(T3, 1), 0x20);
1286 tmp.
packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1287 _mm512_extractf64x4_pd(T2, 1), 0x31);
1288 tmp.
packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1289 _mm512_extractf64x4_pd(T3, 1), 0x31);
1298 __m512d T0 = _mm512_unpacklo_pd(kernel.
packet[0], kernel.
packet[1]);
1299 __m512d
T1 = _mm512_unpackhi_pd(kernel.
packet[0], kernel.
packet[1]);
1300 __m512d
T2 = _mm512_unpacklo_pd(kernel.
packet[2], kernel.
packet[3]);
1301 __m512d
T3 = _mm512_unpackhi_pd(kernel.
packet[2], kernel.
packet[3]);
1302 __m512d
T4 = _mm512_unpacklo_pd(kernel.
packet[4], kernel.
packet[5]);
1303 __m512d
T5 = _mm512_unpackhi_pd(kernel.
packet[4], kernel.
packet[5]);
1304 __m512d
T6 = _mm512_unpacklo_pd(kernel.
packet[6], kernel.
packet[7]);
1305 __m512d T7 = _mm512_unpackhi_pd(kernel.
packet[6], kernel.
packet[7]);
1309 tmp.
packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1310 _mm512_extractf64x4_pd(T2, 0), 0x20);
1311 tmp.
packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1312 _mm512_extractf64x4_pd(T3, 0), 0x20);
1313 tmp.
packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
1314 _mm512_extractf64x4_pd(T2, 0), 0x31);
1315 tmp.
packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
1316 _mm512_extractf64x4_pd(T3, 0), 0x31);
1318 tmp.
packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1319 _mm512_extractf64x4_pd(T2, 1), 0x20);
1320 tmp.
packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1321 _mm512_extractf64x4_pd(T3, 1), 0x20);
1322 tmp.
packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
1323 _mm512_extractf64x4_pd(T2, 1), 0x31);
1324 tmp.
packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
1325 _mm512_extractf64x4_pd(T3, 1), 0x31);
1327 tmp.
packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1328 _mm512_extractf64x4_pd(T6, 0), 0x20);
1329 tmp.
packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1330 _mm512_extractf64x4_pd(T7, 0), 0x20);
1331 tmp.
packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
1332 _mm512_extractf64x4_pd(T6, 0), 0x31);
1333 tmp.
packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
1334 _mm512_extractf64x4_pd(T7, 0), 0x31);
1336 tmp.
packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1337 _mm512_extractf64x4_pd(T6, 1), 0x20);
1338 tmp.
packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1339 _mm512_extractf64x4_pd(T7, 1), 0x20);
1340 tmp.
packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
1341 _mm512_extractf64x4_pd(T6, 1), 0x31);
1342 tmp.
packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
1343 _mm512_extractf64x4_pd(T7, 1), 0x31);
1358 const Packet16f& ) {
1359 assert(
false &&
"To be implemented");
1364 const Packet8d& thenPacket,
1365 const Packet8d& elsePacket) {
1366 __mmask8
m = (ifPacket.
select[0] )
1367 | (ifPacket.
select[1]<<1)
1368 | (ifPacket.
select[2]<<2)
1369 | (ifPacket.
select[3]<<3)
1370 | (ifPacket.
select[4]<<4)
1371 | (ifPacket.
select[5]<<5)
1372 | (ifPacket.
select[6]<<6)
1373 | (ifPacket.
select[7]<<7);
1374 return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
1379 return _mm256_set1_epi16(from.x);
1387 return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1391 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1397 _mm256_store_si256((__m256i*)(
void*)to, from);
1403 _mm256_storeu_si256((__m256i*)(
void*)to, from);
1408 unsigned short a = from[0].x;
1409 unsigned short b = from[1].x;
1410 unsigned short c = from[2].x;
1411 unsigned short d = from[3].x;
1412 unsigned short e = from[4].x;
1413 unsigned short f = from[5].x;
1414 unsigned short g = from[6].x;
1415 unsigned short h = from[7].x;
1416 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1421 unsigned short a = from[0].
x;
1422 unsigned short b = from[1].
x;
1423 unsigned short c = from[2].
x;
1424 unsigned short d = from[3].
x;
1425 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
1429 #ifdef EIGEN_HAS_FP16_C 1430 return _mm512_cvtph_ps(a);
1451 return _mm512_set_ps(
1452 ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
1457 #ifdef EIGEN_HAS_FP16_C 1458 return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
1479 return _mm256_set_epi16(
1480 hf.
x, he.
x, hd.
x, hc.
x, hb.
x, ha.
x, h9.
x, h8.
x,
1481 h7.
x, h6.
x, h5.
x, h4.
x, h3.
x, h2.
x, h1.
x, h0.
x);
1491 const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
1492 return _mm256_andnot_si256(sign_mask, a);
1497 const Packet16h&
b) {
1503 const Packet16h&
b) {
1528 return _mm256_blendv_epi8(b, a, mask);
1568 Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
1569 return _mm256_xor_si256(a, sign_mask);
1575 Packet16f rf =
padd(af, bf);
1582 Packet16f rf =
psub(af, bf);
1589 Packet16f rf =
pmul(af, bf);
1596 Packet16f rf =
pdiv(af, bf);
1607 Packet8h lane0 = _mm256_extractf128_si256(a, 0);
1608 Packet8h lane1 = _mm256_extractf128_si256(a, 1);
1631 __m128i
m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
1632 return _mm256_insertf128_si256(
1633 _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
1634 _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
1639 return _mm256_set_epi16(
1640 from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
1641 from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
1642 from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
1643 from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1650 to[stride*0] = aux[0];
1651 to[stride*1] = aux[1];
1652 to[stride*2] = aux[2];
1653 to[stride*3] = aux[3];
1654 to[stride*4] = aux[4];
1655 to[stride*5] = aux[5];
1656 to[stride*6] = aux[6];
1657 to[stride*7] = aux[7];
1658 to[stride*8] = aux[8];
1659 to[stride*9] = aux[9];
1660 to[stride*10] = aux[10];
1661 to[stride*11] = aux[11];
1662 to[stride*12] = aux[12];
1663 to[stride*13] = aux[13];
1664 to[stride*14] = aux[14];
1665 to[stride*15] = aux[15];
1670 __m256i a = kernel.
packet[0];
1671 __m256i b = kernel.
packet[1];
1672 __m256i c = kernel.
packet[2];
1680 __m256i k = kernel.
packet[10];
1681 __m256i
l = kernel.
packet[11];
1682 __m256i
m = kernel.
packet[12];
1683 __m256i
n = kernel.
packet[13];
1684 __m256i o = kernel.
packet[14];
1685 __m256i
p = kernel.
packet[15];
1687 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
1688 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
1689 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
1690 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
1691 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
1692 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
1693 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
1694 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
1696 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
1697 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
1698 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
1699 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
1700 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
1701 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
1702 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
1703 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
1705 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
1706 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
1707 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
1708 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
1709 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
1710 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
1711 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
1712 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
1714 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
1715 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
1716 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
1717 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
1718 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
1719 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
1720 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
1721 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
1723 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
1724 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
1725 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
1726 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
1727 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
1728 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
1729 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
1730 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
1731 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
1732 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
1733 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
1734 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
1735 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
1736 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
1737 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
1738 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
1741 __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
1742 __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
1743 __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
1744 __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
1745 __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
1746 __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
1747 __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
1748 __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
1749 __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
1750 __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
1751 __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
1752 __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
1753 __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
1754 __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
1755 __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
1756 __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
1758 kernel.
packet[0] = a_p_0;
1759 kernel.
packet[1] = a_p_1;
1760 kernel.
packet[2] = a_p_2;
1761 kernel.
packet[3] = a_p_3;
1762 kernel.
packet[4] = a_p_4;
1763 kernel.
packet[5] = a_p_5;
1764 kernel.
packet[6] = a_p_6;
1765 kernel.
packet[7] = a_p_7;
1766 kernel.
packet[8] = a_p_8;
1767 kernel.
packet[9] = a_p_9;
1768 kernel.
packet[10] = a_p_a;
1769 kernel.
packet[11] = a_p_b;
1770 kernel.
packet[12] = a_p_c;
1771 kernel.
packet[13] = a_p_d;
1772 kernel.
packet[14] = a_p_e;
1773 kernel.
packet[15] = a_p_f;
1790 for (
int i = 0;
i < 8; ++
i) {
1791 for (
int j = 0;
j < 8; ++
j) {
1792 out[
i][
j] = in[
j][2*
i];
1794 for (
int j = 0;
j < 8; ++
j) {
1795 out[
i][
j+8] = in[
j][2*
i+1];
1819 for (
int i = 0;
i < 4; ++
i) {
1820 for (
int j = 0;
j < 4; ++
j) {
1821 out[
i][
j] = in[
j][4*
i];
1823 for (
int j = 0;
j < 4; ++
j) {
1824 out[
i][
j+4] = in[
j][4*
i+1];
1826 for (
int j = 0;
j < 4; ++
j) {
1827 out[
i][
j+8] = in[
j][4*
i+2];
1829 for (
int j = 0;
j < 4; ++
j) {
1830 out[
i][
j+12] = in[
j][4*
i+3];
1848 AlignedOnScalar = 1,
1855 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) 1856 #ifdef EIGEN_VECTORIZE_AVX512DQ 1878 enum {
size=16, alignment=
Aligned32, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
1884 return _mm256_set1_epi16(from.value);
1890 t.
value =
static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
1896 return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
1901 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
1906 const Packet16bf& from) {
1907 _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
1912 const Packet16bf& from) {
1913 _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
1919 unsigned short a = from[0].value;
1920 unsigned short b = from[1].value;
1921 unsigned short c = from[2].value;
1922 unsigned short d = from[3].value;
1923 unsigned short e = from[4].value;
1924 unsigned short f = from[5].value;
1925 unsigned short g = from[6].value;
1926 unsigned short h = from[7].value;
1927 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
1933 unsigned short a = from[0].
value;
1934 unsigned short b = from[1].
value;
1935 unsigned short c = from[2].
value;
1936 unsigned short d = from[3].
value;
1937 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
1941 return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
1948 #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) 1952 r = (__m256i)(_mm512_cvtneps_pbh(a));
1956 __m512i input = _mm512_castps_si512(a);
1957 __m512i nan = _mm512_set1_epi32(0x7fc0);
1960 t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
1962 t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
1964 t = _mm512_add_epi32(t, input);
1966 t = _mm512_srli_epi32(t, 16);
1969 __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
1971 t = _mm512_mask_blend_epi32(mask, nan, t);
1973 r = _mm512_cvtepi32_epi16(t);
1974 #endif // EIGEN_VECTORIZE_AVX512BF16 2001 const Packet16bf& b) {
2007 const Packet16bf& a,
2008 const Packet16bf& b) {
2011 return _mm256_blendv_epi8(b, a, mask);
2033 const Packet16bf& b) {
2039 const Packet16bf& b) {
2045 const Packet16bf& b) {
2051 const Packet16bf& b) {
2057 Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
2058 return _mm256_xor_si256(a, sign_mask);
2068 const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2069 return _mm256_andnot_si256(sign_mask, a);
2074 const Packet16bf&
b) {
2080 const Packet16bf&
b) {
2086 const Packet16bf&
b) {
2092 const Packet16bf&
b) {
2098 const Packet16bf&
b) {
2104 const Packet16bf&
b) {
2115 Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
2116 Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
2142 __m256i
m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
2143 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
2147 res = _mm256_permute2x128_si256(a, a, 1);
2149 return _mm256_shuffle_epi8(res, m);
2155 return _mm256_set_epi16(
2156 from[15*stride].
value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
2157 from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
2158 from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
2159 from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
2164 const Packet16bf& from,
2168 to[stride*0] = aux[0];
2169 to[stride*1] = aux[1];
2170 to[stride*2] = aux[2];
2171 to[stride*3] = aux[3];
2172 to[stride*4] = aux[4];
2173 to[stride*5] = aux[5];
2174 to[stride*6] = aux[6];
2175 to[stride*7] = aux[7];
2176 to[stride*8] = aux[8];
2177 to[stride*9] = aux[9];
2178 to[stride*10] = aux[10];
2179 to[stride*11] = aux[11];
2180 to[stride*12] = aux[12];
2181 to[stride*13] = aux[13];
2182 to[stride*14] = aux[14];
2183 to[stride*15] = aux[15];
2187 __m256i a = kernel.
packet[0];
2188 __m256i b = kernel.
packet[1];
2189 __m256i c = kernel.
packet[2];
2197 __m256i k = kernel.
packet[10];
2198 __m256i
l = kernel.
packet[11];
2199 __m256i
m = kernel.
packet[12];
2200 __m256i
n = kernel.
packet[13];
2201 __m256i o = kernel.
packet[14];
2202 __m256i
p = kernel.
packet[15];
2204 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
2205 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
2206 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
2207 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
2208 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
2209 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
2210 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
2211 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
2213 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
2214 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
2215 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
2216 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
2217 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
2218 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
2219 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
2220 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
2222 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2223 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2224 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
2225 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
2226 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
2227 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
2228 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
2229 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
2231 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2232 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2233 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
2234 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
2235 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
2236 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
2237 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
2238 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
2240 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
2241 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
2242 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
2243 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
2244 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
2245 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
2246 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
2247 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
2248 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
2249 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
2250 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
2251 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
2252 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
2253 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
2254 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
2255 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
2258 kernel.
packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
2259 kernel.
packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
2260 kernel.
packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
2261 kernel.
packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
2262 kernel.
packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
2263 kernel.
packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
2264 kernel.
packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
2265 kernel.
packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
2266 kernel.
packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
2267 kernel.
packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
2268 kernel.
packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
2269 kernel.
packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
2270 kernel.
packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
2271 kernel.
packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
2272 kernel.
packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
2273 kernel.
packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2277 __m256i a = kernel.
packet[0];
2278 __m256i b = kernel.
packet[1];
2279 __m256i c = kernel.
packet[2];
2282 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
2283 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
2284 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
2285 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
2287 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2288 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2289 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2290 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2293 kernel.
packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
2294 kernel.
packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
2295 kernel.
packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
2296 kernel.
packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
2303 #endif // EIGEN_PACKET_MATH_AVX512_H EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h &a)
EIGEN_STRONG_INLINE void pscatter< half, Packet16h >(half *to, const Packet16h &from, Index stride)
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf &a)
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f)
EIGEN_STRONG_INLINE Packet16f pmax< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_DEVICE_FUNC Packet8d pgather< double, Packet8d >(const double *from, Index stride)
EIGEN_STRONG_INLINE double pfirst< Packet8d >(const Packet8d &a)
#define EIGEN_STRONG_INLINE
EIGEN_STRONG_INLINE Packet8d pceil< Packet8d >(const Packet8d &a)
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE bool predux_any(const Packet4f &x)
EIGEN_STRONG_INLINE Packet8d pfloor< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
static const Pose3 T3(Rot3::Rodrigues(-90, 0, 0), Point3(1, 2, 3))
EIGEN_STRONG_INLINE Packet16h pload< Packet16h >(const Eigen::half *from)
HessenbergDecomposition< MatrixXcf > hd(4)
EIGEN_STRONG_INLINE double predux_min< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE void pstore1< Packet8d >(double *to, const double &a)
EIGEN_STRONG_INLINE Packet16f pload1< Packet16f >(const float *from)
EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE half predux_mul< Packet16h >(const Packet16h &from)
EIGEN_STRONG_INLINE Packet16i ploadu< Packet16i >(const int *from)
#define EIGEN_DEBUG_UNALIGNED_LOAD
EIGEN_STRONG_INLINE Packet16f print< Packet16f >(const Packet16f &a)
std::ofstream out("Result.txt")
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
EIGEN_STRONG_INLINE Packet8d print< Packet8d >(const Packet8d &a)
Vector6 f6(const double x1, const double x2, const double x3, const double x4, const double x5, const double x6)
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16f pand< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16f padd< Packet16f >(const Packet16f &a, const Packet16f &b)
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)
EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f &)
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b)
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i &a)
EIGEN_STRONG_INLINE Packet16f pload< Packet16f >(const float *from)
EIGEN_STRONG_INLINE Packet16f pset1< Packet16f >(const float &from)
EIGEN_STRONG_INLINE Packet16f pxor< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet8d ptrue< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x)
#define EIGEN_DEBUG_ALIGNED_STORE
Namespace containing all symbols from the Eigen library.
EIGEN_STRONG_INLINE void pstoreu< half >(Eigen::half *to, const Packet16h &from)
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNaN, Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16f pmul< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet8h predux_half_dowto4< Packet16h >(const Packet16h &a)
EIGEN_STRONG_INLINE Packet16i pxor< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
EIGEN_STRONG_INLINE Packet16f pmin< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16h pmax< Packet16h >(const Packet16h &a, const Packet16h &b)
EIGEN_STRONG_INLINE bfloat16 predux_mul< Packet16bf >(const Packet16bf &from)
EIGEN_STRONG_INLINE Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
static const Pose3 T2(Rot3::Rodrigues(0.3, 0.2, 0.1), P2)
EIGEN_STRONG_INLINE Packet8d psub< Packet8d >(const Packet8d &a, const Packet8d &b)
double f2(const Vector2 &x)
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux(const Packet &a)
#define EIGEN_DEBUG_UNALIGNED_STORE
EIGEN_STRONG_INLINE Packet16f pandnot< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE void prefetch< float >(const float *addr)
EIGEN_STRONG_INLINE Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
EIGEN_STRONG_INLINE Packet8d pxor< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16f pceil< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE Packet16bf pset1< Packet16bf >(const bfloat16 &from)
EIGEN_STRONG_INLINE Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE Packet16i ptrue< Packet16i >(const Packet16i &)
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE Packet16f ptrue< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
EIGEN_STRONG_INLINE Packet16f pdiv< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16h ploadu< Packet16h >(const Eigen::half *from)
EIGEN_STRONG_INLINE Packet16h plset< Packet16h >(const half &a)
EIGEN_DEVICE_FUNC void pscatter< double, Packet8d >(double *to, const Packet8d &from, Index stride)
EIGEN_STRONG_INLINE Packet16f pfloor< Packet16f >(const Packet16f &a)
void g(const string &key, int i)
EIGEN_STRONG_INLINE Packet8d padd< Packet8d >(const Packet8d &a, const Packet8d &b)
static const Similarity3 T4(R, P, s)
#define EIGEN_DEBUG_ALIGNED_LOAD
EIGEN_STRONG_INLINE Packet8d pload1< Packet8d >(const double *from)
EIGEN_STRONG_INLINE double predux< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE Packet16f pfrexp< Packet16f >(const Packet16f &a, Packet16f &exponent)
EIGEN_STRONG_INLINE Packet8d pset1< Packet8d >(const double &from)
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f &a, const Packet4f &b)
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE)
EIGEN_STRONG_INLINE Packet16bf print< Packet16bf >(const Packet16bf &a)
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE Packet16i pload< Packet16i >(const int *from)
EIGEN_DEVICE_FUNC Packet pmin(const Packet &a, const Packet &b)
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h &a)
EIGEN_STRONG_INLINE Packet8d pset1frombits< Packet8d >(const numext::uint64_t from)
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf &a)
eigen_packet_wrapper< __m256i, 2 > Packet16bf
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf)
EIGEN_STRONG_INLINE Packet8d pandnot< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16f pmin< PropagateNumbers, Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16h print< Packet16h >(const Packet16h &a)
static const Line3 l(Rot3(), 1, 1)
EIGEN_STRONG_INLINE double predux_mul< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE void prefetch< int >(const int *addr)
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)
EIGEN_STRONG_INLINE Packet16bf pgather< bfloat16, Packet16bf >(const bfloat16 *from, Index stride)
EIGEN_STRONG_INLINE Packet16bf pdiv< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet &a, Packet &exponent)
EIGEN_STRONG_INLINE Packet8d pround< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE Packet16h padd< Packet16h >(const Packet16h &a, const Packet16h &b)
EIGEN_STRONG_INLINE Packet16bf ploaddup< Packet16bf >(const bfloat16 *from)
EIGEN_DEVICE_FUNC void pscatter< float, Packet16f >(float *to, const Packet16f &from, Index stride)
EIGEN_STRONG_INLINE Packet16f psub< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16bf pround< Packet16bf >(const Packet16bf &a)
EIGEN_STRONG_INLINE Eigen::half pfirst< Packet16h >(const Packet16h &from)
EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
EIGEN_STRONG_INLINE Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
EIGEN_STRONG_INLINE float predux< Packet8f >(const Packet8f &a)
EIGEN_STRONG_INLINE Packet4f ploadu< Packet4f >(const float *from)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
EIGEN_STRONG_INLINE Packet16f por< Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet16i padd< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE void pstoreu< int >(int *to, const Packet4i &from)
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
EIGEN_STRONG_INLINE Packet16h pceil< Packet16h >(const Packet16h &a)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
EIGEN_STRONG_INLINE Packet8d pand< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet8d pldexp< Packet8d >(const Packet8d &a, const Packet8d &exponent)
EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4< Packet16bf >(const Packet16bf &a)
static const Similarity3 T6(Rot3(), Point3(1, 1, 0), 2)
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i &a)
unsigned __int64 uint64_t
EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x)
EIGEN_STRONG_INLINE Packet16f ploaddup< Packet16f >(const float *from)
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
EIGEN_STRONG_INLINE bfloat16 pfirst< Packet16bf >(const Packet16bf &from)
eigen_packet_wrapper< __m256i, 1 > Packet16h
EIGEN_STRONG_INLINE void pstore1< Packet16i >(int *to, const int &a)
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
EIGEN_STRONG_INLINE Packet16bf plset< Packet16bf >(const bfloat16 &a)
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE float predux_mul< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE bfloat16 predux_max< Packet16bf >(const Packet16bf &from)
EIGEN_STRONG_INLINE Packet16h pdiv< Packet16h >(const Packet16h &a, const Packet16h &b)
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4< Packet8d >(const Packet8d &a)
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i &a)
EIGEN_STRONG_INLINE Packet8d pmul< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16f plset< Packet16f >(const float &a)
const char * SsePrefetchPtrType
EIGEN_STRONG_INLINE Packet16bf pmul< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_STRONG_INLINE half predux< Packet16h >(const Packet16h &from)
EIGEN_STRONG_INLINE int pfirst< Packet16i >(const Packet16i &a)
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
Pose3 x3(Rot3::Ypr(M_PI/4.0, 0.0, 0.0), l2)
EIGEN_STRONG_INLINE Packet16i pand< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_DEVICE_FUNC void pstore(Scalar *to, const Packet &from)
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE Packet8d ploadu< Packet8d >(const double *from)
EIGEN_STRONG_INLINE Packet16bf pmin< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_CONSTEXPR Index size(const T &x)
#define EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet8d plset< Packet8d >(const double &a)
EIGEN_STRONG_INLINE Packet8d pload< Packet8d >(const double *from)
EIGEN_STRONG_INLINE float predux_max< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE float pfirst< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE Packet16f pround< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE Packet8i ptrue< Packet8i >(const Packet8i &a)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f &)
EIGEN_STRONG_INLINE float predux< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE bfloat16 predux_min< Packet16bf >(const Packet16bf &from)
EIGEN_STRONG_INLINE Packet16bf ploadu< Packet16bf >(const bfloat16 *from)
Point2 f1(const Point3 &p, OptionalJacobian< 2, 3 > H)
static const Similarity3 T1(R, Point3(3.5, -8.2, 4.2), 1)
double f4(double x, double y, double z)
EIGEN_STRONG_INLINE Eigen::half predux_min< Packet16h >(const Packet16h &a)
EIGEN_STRONG_INLINE Packet16bf padd< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_DEVICE_FUNC unpacket_traits< Packet >::type predux_mul(const Packet &a)
EIGEN_STRONG_INLINE Packet16f ploadquad< Packet16f >(const float *from)
EIGEN_STRONG_INLINE bfloat16 predux< Packet16bf >(const Packet16bf &p)
EIGEN_STRONG_INLINE void pstore< int >(int *to, const Packet4i &from)
EIGEN_STRONG_INLINE Packet16i psub< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE Packet16i pset1< Packet16i >(const int &from)
EIGEN_STRONG_INLINE Packet16i pmul< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE void pstore< half >(Eigen::half *to, const Packet16h &from)
EIGEN_STRONG_INLINE Packet16i pandnot< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE Packet8d pmin< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet16h pfloor< Packet16h >(const Packet16h &a)
EIGEN_STRONG_INLINE Packet8d ploadquad< Packet8d >(const double *from)
double f3(double x1, double x2)
EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half *from)
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
static const Similarity3 T5(R, P, 10)
EIGEN_STRONG_INLINE Packet16i por< Packet16i >(const Packet16i &a, const Packet16i &b)
EIGEN_STRONG_INLINE Packet16f pmax< PropagateNaN, Packet16f >(const Packet16f &a, const Packet16f &b)
EIGEN_STRONG_INLINE Packet8d por< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet8d pmin< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)
EIGEN_STRONG_INLINE Packet16bf pfloor< Packet16bf >(const Packet16bf &a)
EIGEN_STRONG_INLINE Packet8h por(const Packet8h &a, const Packet8h &b)
EIGEN_STRONG_INLINE Packet16h pset1< Packet16h >(const Eigen::half &from)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet &a, const Packet &exponent)
EIGEN_STRONG_INLINE Packet8d pfrexp< Packet8d >(const Packet8d &a, Packet8d &exponent)
EIGEN_STRONG_INLINE Packet16h pmin< Packet16h >(const Packet16h &a, const Packet16h &b)
EIGEN_STRONG_INLINE Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f &a)
EIGEN_STRONG_INLINE void pstore1< Packet16f >(float *to, const float &a)
EIGEN_STRONG_INLINE Packet16bf pload< Packet16bf >(const bfloat16 *from)
EIGEN_STRONG_INLINE Packet16bf psub< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_STRONG_INLINE Eigen::half predux_max< Packet16h >(const Packet16h &a)
EIGEN_STRONG_INLINE Packet16f pset1frombits< Packet16f >(unsigned int from)
EIGEN_STRONG_INLINE Packet16h pmul< Packet16h >(const Packet16h &a, const Packet16h &b)
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
EIGEN_STRONG_INLINE Packet8d pmax< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_STRONG_INLINE Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
EIGEN_STRONG_INLINE Packet16f ploadu< Packet16f >(const float *from)
EIGEN_STRONG_INLINE Packet16bf pceil< Packet16bf >(const Packet16bf &a)
EIGEN_STRONG_INLINE Packet16bf pmax< Packet16bf >(const Packet16bf &a, const Packet16bf &b)
EIGEN_STRONG_INLINE Packet8d pdiv< Packet8d >(const Packet8d &a, const Packet8d &b)
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE void prefetch< double >(const double *addr)
EIGEN_STRONG_INLINE Packet8i pset1< Packet8i >(const int &from)
EIGEN_STRONG_INLINE Packet16h ploaddup< Packet16h >(const Eigen::half *from)
EIGEN_STRONG_INLINE double predux_max< Packet8d >(const Packet8d &a)
EIGEN_DEVICE_FUNC Packet pmax(const Packet &a, const Packet &b)
EIGEN_STRONG_INLINE Packet16h pround< Packet16h >(const Packet16h &a)
EIGEN_STRONG_INLINE Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
EIGEN_STRONG_INLINE void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
EIGEN_STRONG_INLINE void pscatter< bfloat16, Packet16bf >(bfloat16 *to, const Packet16bf &from, Index stride)
EIGEN_STRONG_INLINE float predux_min< Packet16f >(const Packet16f &a)
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf &a)
EIGEN_STRONG_INLINE Packet16h psub< Packet16h >(const Packet16h &a, const Packet16h &b)
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f &a)
EIGEN_DEVICE_FUNC Packet16f pgather< float, Packet16f >(const float *from, Index stride)
EIGEN_STRONG_INLINE Packet16f pldexp< Packet16f >(const Packet16f &a, const Packet16f &exponent)
EIGEN_STRONG_INLINE Packet8d ploaddup< Packet8d >(const double *from)
EIGEN_STRONG_INLINE Packet8d pmax< PropagateNumbers, Packet8d >(const Packet8d &a, const Packet8d &b)