#define _USE_MATH_DEFINES

#if defined(__SSSE3__) && defined(__AVX2__)

#include <tmmintrin.h> // SSSE3 intrinsics
#include <immintrin.h> // AVX2 intrinsics

#pragma pack(push, 1) // All structs in this file are assumed to be byte-packed

// Unpack n YUY2-encoded pixels from s into the destination format selected by FORMAT,
// writing the result to d[0]. The non-template wrappers at the end of the file
// instantiate this for each supported output format.
template<rs2_format FORMAT> void unpack_yuy2(byte * const d[], const byte * s, int n)
{
    auto src = reinterpret_cast<const __m256i *>(s);
    auto dst = reinterpret_cast<__m256i *>(d[0]);

#pragma omp parallel for
    for (int i = 0; i < n / 32; i++)
    {
        const __m256i zero = _mm256_set1_epi8(0);
        const __m256i n100 = _mm256_set1_epi16(100 << 4);
        const __m256i n208 = _mm256_set1_epi16(208 << 4);
        const __m256i n298 = _mm256_set1_epi16(298 << 4);
        const __m256i n409 = _mm256_set1_epi16(409 << 4);
        const __m256i n516 = _mm256_set1_epi16(516 << 4);
        const __m256i evens_odds = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
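        // The n100/n208/n298/n409/n516 constants drive the fixed-point YUV -> RGB conversion
        // used below, which effectively computes
        //     R = clamp((298*(Y-16)               + 409*(V-128)) >> 8)
        //     G = clamp((298*(Y-16) - 100*(U-128) - 208*(V-128)) >> 8)
        //     B = clamp((298*(Y-16) + 516*(U-128)              ) >> 8)
        // The >> 8 is folded into each _mm256_mulhi_epi16 (high 16 bits of the 32-bit product)
        // by pre-shifting both operands left by 4: ((a << 4) * (b << 4)) >> 16 == (a * b) >> 8.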
        // Load 32 YUY2-encoded pixels (64 bytes) into two 256-bit registers
        __m256i s0 = _mm256_loadu_si256(&src[i * 2]);
        __m256i s1 = _mm256_loadu_si256(&src[i * 2 + 1]);
        if (FORMAT == RS2_FORMAT_Y8)
        {
            // Align all Y components and output 32 pixels (32 bytes) at once
            __m256i y0 = _mm256_shuffle_epi8(s0, _mm256_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14,
                1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14));
            __m256i y1 = _mm256_shuffle_epi8(s1, _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
            _mm256_storeu_si256(&dst[i], _mm256_alignr_epi8(y0, y1, 8));
            continue;
        }
        // Shuffle all Y components to the low-order bytes and the U/V components to the
        // high-order bytes of each 128-bit lane (yyyyyyyyuuuuvvvv)
        const __m256i evens_odd1s_odd3s = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 5, 9, 13, 3, 7, 11, 15,
            0, 2, 4, 6, 8, 10, 12, 14, 1, 5, 9, 13, 3, 7, 11, 15);
        __m256i yyyyyyyyuuuuvvvv0 = _mm256_shuffle_epi8(s0, evens_odd1s_odd3s);
        __m256i yyyyyyyyuuuuvvvv8 = _mm256_shuffle_epi8(s1, evens_odd1s_odd3s);

        // Retrieve all 32 Y components as 16-bit values (16 per register)
        __m256i y16__0_7 = _mm256_unpacklo_epi8(yyyyyyyyuuuuvvvv0, zero);
        __m256i y16__8_F = _mm256_unpacklo_epi8(yyyyyyyyuuuuvvvv8, zero);
        if (FORMAT == RS2_FORMAT_Y16)
        {
            // Output 32 pixels (64 bytes) at once, with luma shifted into the high byte of each 16-bit sample
            _mm256_storeu_si256(&dst[i * 2], _mm256_slli_epi16(y16__0_7, 8));
            _mm256_storeu_si256(&dst[i * 2 + 1], _mm256_slli_epi16(y16__8_F, 8));
            continue;
        }
        // Retrieve all 16 U and V components as 16-bit values (8 per register)
        __m256i uv = _mm256_unpackhi_epi32(yyyyyyyyuuuuvvvv0, yyyyyyyyuuuuvvvv8);
        __m256i u = _mm256_unpacklo_epi8(uv, uv); // each U duplicated, since it is shared by two pixels
        __m256i v = _mm256_unpackhi_epi8(uv, uv); // each V duplicated, since it is shared by two pixels
        __m256i u16__0_7 = _mm256_unpacklo_epi8(u, zero);
        __m256i u16__8_F = _mm256_unpackhi_epi8(u, zero);
        __m256i v16__0_7 = _mm256_unpacklo_epi8(v, zero);
        __m256i v16__8_F = _mm256_unpackhi_epi8(v, zero);
        // Compute R, G, B as 16-bit values clamped to [0, 255] for the first 16 pixels
        __m256i c16__0_7 = _mm256_slli_epi16(_mm256_subs_epi16(y16__0_7, _mm256_set1_epi16(16)), 4);
        __m256i d16__0_7 = _mm256_slli_epi16(_mm256_subs_epi16(u16__0_7, _mm256_set1_epi16(128)), 4);
        __m256i e16__0_7 = _mm256_slli_epi16(_mm256_subs_epi16(v16__0_7, _mm256_set1_epi16(128)), 4);
        __m256i r16__0_7 = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_add_epi16(_mm256_mulhi_epi16(c16__0_7, n298), _mm256_mulhi_epi16(e16__0_7, n409))));
        __m256i g16__0_7 = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_sub_epi16(_mm256_sub_epi16(_mm256_mulhi_epi16(c16__0_7, n298), _mm256_mulhi_epi16(d16__0_7, n100)), _mm256_mulhi_epi16(e16__0_7, n208))));
        __m256i b16__0_7 = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_add_epi16(_mm256_mulhi_epi16(c16__0_7, n298), _mm256_mulhi_epi16(d16__0_7, n516))));

        // Compute R, G, B as 16-bit values clamped to [0, 255] for the second 16 pixels
        __m256i c16__8_F = _mm256_slli_epi16(_mm256_subs_epi16(y16__8_F, _mm256_set1_epi16(16)), 4);
        __m256i d16__8_F = _mm256_slli_epi16(_mm256_subs_epi16(u16__8_F, _mm256_set1_epi16(128)), 4);
        __m256i e16__8_F = _mm256_slli_epi16(_mm256_subs_epi16(v16__8_F, _mm256_set1_epi16(128)), 4);
        __m256i r16__8_F = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_add_epi16(_mm256_mulhi_epi16(c16__8_F, n298), _mm256_mulhi_epi16(e16__8_F, n409))));
        __m256i g16__8_F = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_sub_epi16(_mm256_sub_epi16(_mm256_mulhi_epi16(c16__8_F, n298), _mm256_mulhi_epi16(d16__8_F, n100)), _mm256_mulhi_epi16(e16__8_F, n208))));
        __m256i b16__8_F = _mm256_min_epi16(_mm256_set1_epi16(255), _mm256_max_epi16(zero, _mm256_add_epi16(_mm256_mulhi_epi16(c16__8_F, n298), _mm256_mulhi_epi16(d16__8_F, n516))));
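        // Each result above sits in the low byte of a 16-bit lane, clamped to [0, 255] by the
        // min/max pair. The evens_odds mask used below gathers those low bytes (indices 0, 2,
        // 4, ... within each 128-bit lane), turning the sixteen 16-bit lanes back into packed
        // 8-bit channel values ready for interleaving.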
        if (FORMAT == RS2_FORMAT_RGB8 || FORMAT == RS2_FORMAT_RGBA8)
        {
            // Interleave the separate R, G, B results into registers of (R, G, B, A) quads
            __m256i rg8__0_7 = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(r16__0_7, evens_odds), _mm256_shuffle_epi8(g16__0_7, evens_odds));
            __m256i ba8__0_7 = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(b16__0_7, evens_odds), _mm256_set1_epi8(-1));
            __m256i rgba_0_3 = _mm256_unpacklo_epi16(rg8__0_7, ba8__0_7);
            __m256i rgba_4_7 = _mm256_unpackhi_epi16(rg8__0_7, ba8__0_7);

            __m128i ZW1 = _mm256_extracti128_si256(rgba_4_7, 0);
            __m256i XYZW1 = _mm256_inserti128_si256(rgba_0_3, ZW1, 1);

            __m128i UV1 = _mm256_extracti128_si256(rgba_0_3, 1);
            __m256i UVST1 = _mm256_inserti128_si256(rgba_4_7, UV1, 0);

            __m256i rg8__8_F = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(r16__8_F, evens_odds), _mm256_shuffle_epi8(g16__8_F, evens_odds));
            __m256i ba8__8_F = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(b16__8_F, evens_odds), _mm256_set1_epi8(-1));
            __m256i rgba_8_B = _mm256_unpacklo_epi16(rg8__8_F, ba8__8_F);
            __m256i rgba_C_F = _mm256_unpackhi_epi16(rg8__8_F, ba8__8_F);

            __m128i ZW2 = _mm256_extracti128_si256(rgba_C_F, 0);
            __m256i XYZW2 = _mm256_inserti128_si256(rgba_8_B, ZW2, 1);

            __m128i UV2 = _mm256_extracti128_si256(rgba_8_B, 1);
            __m256i UVST2 = _mm256_inserti128_si256(rgba_C_F, UV2, 0);

            if (FORMAT == RS2_FORMAT_RGBA8)
            {
                // Store 32 RGBA pixels (128 bytes) at once
                _mm256_storeu_si256(&dst[i * 4], XYZW1);
                _mm256_storeu_si256(&dst[i * 4 + 1], UVST1);
                _mm256_storeu_si256(&dst[i * 4 + 2], XYZW2);
                _mm256_storeu_si256(&dst[i * 4 + 3], UVST2);
            }
            if (FORMAT == RS2_FORMAT_RGB8)
            {
                // Split the eight RGBA quads back into 128-bit halves
                __m128i rgba0 = _mm256_extracti128_si256(XYZW1, 0);
                __m128i rgba1 = _mm256_extracti128_si256(XYZW1, 1);
                __m128i rgba2 = _mm256_extracti128_si256(UVST1, 0);
                __m128i rgba3 = _mm256_extracti128_si256(UVST1, 1);
                __m128i rgba4 = _mm256_extracti128_si256(XYZW2, 0);
                __m128i rgba5 = _mm256_extracti128_si256(XYZW2, 1);
                __m128i rgba6 = _mm256_extracti128_si256(UVST2, 0);
                __m128i rgba7 = _mm256_extracti128_si256(UVST2, 1);

                // Shuffle each register so its 12 RGB bytes are contiguous and its four alpha
                // bytes land in the slot that the following alignr discards
                __m128i rgb0 = _mm_shuffle_epi8(rgba0, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i rgb1 = _mm_shuffle_epi8(rgba1, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i rgb2 = _mm_shuffle_epi8(rgba2, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                __m128i rgb3 = _mm_shuffle_epi8(rgba3, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
                __m128i rgb4 = _mm_shuffle_epi8(rgba4, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i rgb5 = _mm_shuffle_epi8(rgba5, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i rgb6 = _mm_shuffle_epi8(rgba6, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                __m128i rgb7 = _mm_shuffle_epi8(rgba7, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                // Stitch the 12-byte triples into six fully packed 16-byte registers
                __m128i a1 = _mm_alignr_epi8(rgb1, rgb0, 4);
                __m128i a2 = _mm_alignr_epi8(rgb2, rgb1, 8);
                __m128i a3 = _mm_alignr_epi8(rgb3, rgb2, 12);
                __m128i a4 = _mm_alignr_epi8(rgb5, rgb4, 4);
                __m128i a5 = _mm_alignr_epi8(rgb6, rgb5, 8);
                __m128i a6 = _mm_alignr_epi8(rgb7, rgb6, 12);
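                // 32 pixels * 3 bytes = 96 bytes of RGB output per iteration. a1..a6 now hold
                // those 96 bytes as six packed 16-byte registers, which are merged below into
                // three 256-bit values so dst (indexed by i * 3) is written with exactly three
                // 32-byte stores.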
                __m256i a1_2 = _mm256_castsi128_si256(a1);
                a1_2 = _mm256_inserti128_si256(a1_2, a2, 1);

                __m256i a3_4 = _mm256_castsi128_si256(a3);
                a3_4 = _mm256_inserti128_si256(a3_4, a4, 1);

                __m256i a5_6 = _mm256_castsi128_si256(a5);
                a5_6 = _mm256_inserti128_si256(a5_6, a6, 1);

                _mm256_storeu_si256(&dst[i * 3], a1_2);
                _mm256_storeu_si256(&dst[i * 3 + 1], a3_4);
                _mm256_storeu_si256(&dst[i * 3 + 2], a5_6);
            }
        }
        if (FORMAT == RS2_FORMAT_BGR8 || FORMAT == RS2_FORMAT_BGRA8)
        {
            // Same packing as above, but with B and R swapped into (B, G, R, A) order
            __m256i bg8__0_7 = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(b16__0_7, evens_odds), _mm256_shuffle_epi8(g16__0_7, evens_odds));
            __m256i ra8__0_7 = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(r16__0_7, evens_odds), _mm256_set1_epi8(-1));
            __m256i bgra_0_3 = _mm256_unpacklo_epi16(bg8__0_7, ra8__0_7);
            __m256i bgra_4_7 = _mm256_unpackhi_epi16(bg8__0_7, ra8__0_7);

            __m128i ZW1 = _mm256_extracti128_si256(bgra_4_7, 0);
            __m256i XYZW1 = _mm256_inserti128_si256(bgra_0_3, ZW1, 1);

            __m128i UV1 = _mm256_extracti128_si256(bgra_0_3, 1);
            __m256i UVST1 = _mm256_inserti128_si256(bgra_4_7, UV1, 0);

            __m256i bg8__8_F = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(b16__8_F, evens_odds), _mm256_shuffle_epi8(g16__8_F, evens_odds));
            __m256i ra8__8_F = _mm256_unpacklo_epi8(_mm256_shuffle_epi8(r16__8_F, evens_odds), _mm256_set1_epi8(-1));
            __m256i bgra_8_B = _mm256_unpacklo_epi16(bg8__8_F, ra8__8_F);
            __m256i bgra_C_F = _mm256_unpackhi_epi16(bg8__8_F, ra8__8_F);

            __m128i ZW2 = _mm256_extracti128_si256(bgra_C_F, 0);
            __m256i XYZW2 = _mm256_inserti128_si256(bgra_8_B, ZW2, 1);

            __m128i UV2 = _mm256_extracti128_si256(bgra_8_B, 1);
            __m256i UVST2 = _mm256_inserti128_si256(bgra_C_F, UV2, 0);

            if (FORMAT == RS2_FORMAT_BGRA8)
            {
                // Store 32 BGRA pixels (128 bytes) at once
                _mm256_storeu_si256(&dst[i * 4], XYZW1);
                _mm256_storeu_si256(&dst[i * 4 + 1], UVST1);
                _mm256_storeu_si256(&dst[i * 4 + 2], XYZW2);
                _mm256_storeu_si256(&dst[i * 4 + 3], UVST2);
            }
            if (FORMAT == RS2_FORMAT_BGR8)
            {
                __m128i rgba0 = _mm256_extracti128_si256(XYZW1, 0);
                __m128i rgba1 = _mm256_extracti128_si256(XYZW1, 1);
                __m128i rgba2 = _mm256_extracti128_si256(UVST1, 0);
                __m128i rgba3 = _mm256_extracti128_si256(UVST1, 1);
                __m128i rgba4 = _mm256_extracti128_si256(XYZW2, 0);
                __m128i rgba5 = _mm256_extracti128_si256(XYZW2, 1);
                __m128i rgba6 = _mm256_extracti128_si256(UVST2, 0);
                __m128i rgba7 = _mm256_extracti128_si256(UVST2, 1);

                // As in the RGB8 path: park the alpha bytes where the following alignr drops them
                __m128i bgr0 = _mm_shuffle_epi8(rgba0, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i bgr1 = _mm_shuffle_epi8(rgba1, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i bgr2 = _mm_shuffle_epi8(rgba2, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                __m128i bgr3 = _mm_shuffle_epi8(rgba3, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
                __m128i bgr4 = _mm_shuffle_epi8(rgba4, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i bgr5 = _mm_shuffle_epi8(rgba5, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                __m128i bgr6 = _mm_shuffle_epi8(rgba6, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                __m128i bgr7 = _mm_shuffle_epi8(rgba7, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                __m128i a1 = _mm_alignr_epi8(bgr1, bgr0, 4);
                __m128i a2 = _mm_alignr_epi8(bgr2, bgr1, 8);
                __m128i a3 = _mm_alignr_epi8(bgr3, bgr2, 12);
                __m128i a4 = _mm_alignr_epi8(bgr5, bgr4, 4);
                __m128i a5 = _mm_alignr_epi8(bgr6, bgr5, 8);
                __m128i a6 = _mm_alignr_epi8(bgr7, bgr6, 12);

                __m256i a1_2 = _mm256_castsi128_si256(a1);
                a1_2 = _mm256_inserti128_si256(a1_2, a2, 1);

                __m256i a3_4 = _mm256_castsi128_si256(a3);
                a3_4 = _mm256_inserti128_si256(a3_4, a4, 1);

                __m256i a5_6 = _mm256_castsi128_si256(a5);
                a5_6 = _mm256_inserti128_si256(a5_6, a6, 1);

                // Store 32 BGR pixels (96 bytes) as three 256-bit writes
                _mm256_storeu_si256(&dst[i * 3], a1_2);
                _mm256_storeu_si256(&dst[i * 3 + 1], a3_4);
                _mm256_storeu_si256(&dst[i * 3 + 2], a5_6);
            }
        }
    }
}
void unpack_yuy2_avx_y8(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_Y8>(d, s, n);
}

void unpack_yuy2_avx_y16(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_Y16>(d, s, n);
}

void unpack_yuy2_avx_rgb8(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_RGB8>(d, s, n);
}

void unpack_yuy2_avx_rgba8(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_RGBA8>(d, s, n);
}

void unpack_yuy2_avx_bgr8(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_BGR8>(d, s, n);
}

void unpack_yuy2_avx_bgra8(byte * const d[], const byte * s, int n)
{
    unpack_yuy2<RS2_FORMAT_BGRA8>(d, s, n);
}

#pragma pack(pop)
#endif // defined(__SSSE3__) && defined(__AVX2__)
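// Usage sketch (illustrative only; the frame size below is a hypothetical example, not taken
// from this file). Each loop iteration consumes 64 input bytes, i.e. 32 YUY2 pixels, so n is
// the pixel count and should be a multiple of 32 for this path:
//
//     const int w = 640, h = 480;                       // hypothetical resolution
//     std::vector<byte> yuy2(w * h * 2);                // 2 bytes per YUY2 pixel
//     std::vector<byte> rgb(w * h * 3);                 // 3 bytes per RGB8 pixel
//     byte * planes[] = { rgb.data() };                 // only d[0] is used by these unpackers
//     unpack_yuy2_avx_rgb8(planes, yuy2.data(), w * h);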