10 #define STB_IMAGE_STATIC
11 #define STB_IMAGE_IMPLEMENTATION
12 #include "../third-party/stb_image.h"
15 #include "cuda/cuda-conversion.cuh"
18 #include <tmmintrin.h>
// Platform-specific AVX capability detection (fragments; intermediate lines not visible in this extract).
21 #if defined (ANDROID) || (defined (__linux__) && !defined (__x86_64__)) || (defined (__APPLE__) && !defined (__x86_64__))
// Non-x86_64 targets: AVX can never be present, so report false unconditionally.
23 bool has_avx() {
return false; }
// MSVC path (presumably — surrounding #if not visible): map the generic cpuid(info, x)
// macro onto the compiler intrinsic __cpuidex with sub-leaf 0.
29 #define cpuid(info, x) __cpuidex(info, x, 0)
// x86 path: bit 28 of ECX (info[2]) from CPUID leaf 1 is the AVX feature flag.
42 return (
info[2] & ((
int)1 << 28)) != 0;
// CUDA build: delegate the whole YUY2 conversion to the GPU implementation.
59 rscuda::unpack_yuy2_cuda<FORMAT>(
d,
s,
n);
62 #if defined __SSSE3__ && ! defined ANDROID
// SSSE3 path: treat source/destination as streams of 16-byte SSE vectors.
// NOTE(review): d[0] is assumed to be the single destination plane — confirm against caller.
78 auto src =
reinterpret_cast<const __m128i *
>(
s);
79 auto dst =
reinterpret_cast<__m128i *
>(
d[0]);
// Each iteration consumes 32 source bytes (16 YUY2 pixels: Y0 U Y1 V ...).
81 #pragma omp parallel for
82 for (
int i = 0;
i <
n / 16;
i++)
// Fixed-point conversion constants, pre-shifted left 4 so that
// mulhi(x<<4, k<<4) == (x*k) >> 8 (see conversion section below).
84 const __m128i zero = _mm_set1_epi8(0);
85 const __m128i n100 = _mm_set1_epi16(100 << 4);
86 const __m128i n208 = _mm_set1_epi16(208 << 4);
87 const __m128i n298 = _mm_set1_epi16(298 << 4);
88 const __m128i n409 = _mm_set1_epi16(409 << 4);
89 const __m128i n516 = _mm_set1_epi16(516 << 4);
// Shuffle mask gathering even-indexed bytes into the low half, odd into the high half.
90 const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
// Load two 16-byte chunks = 16 YUY2 pixels.
93 __m128i s0 = _mm_loadu_si128(&
src[
i * 2]);
94 __m128i
s1 = _mm_loadu_si128(&
src[
i * 2 + 1]);
// Y8 output (case label not visible here): keep only the low byte of each
// 16-bit Y/C pair, then pack 16 luma bytes into one vector.
98 const __m128i vmask = _mm_set1_epi16( 0x00ff );
99 s0 = _mm_and_si128( s0, vmask );
100 s1 = _mm_and_si128(
s1, vmask );
102 _mm_storeu_si128( &
dst[
i], _mm_packus_epi16( s0,
s1 ) );
// Shared preparation for Y16/RGB outputs: regroup each chunk as
// 8 Y bytes | 4 U bytes | 4 V bytes.
107 const __m128i evens_odd1s_odd3s = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 5, 9, 13, 3, 7, 11, 15);
108 __m128i yyyyyyyyuuuuvvvv0 = _mm_shuffle_epi8(s0, evens_odd1s_odd3s);
109 __m128i yyyyyyyyuuuuvvvv8 = _mm_shuffle_epi8(
s1, evens_odd1s_odd3s);
// Widen luma bytes to 16-bit lanes.
112 __m128i y16__0_7 = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv0, zero);
113 __m128i y16__8_F = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv8, zero);
// Y16 output: shift 8-bit luma into the high byte of each 16-bit sample.
118 _mm_storeu_si128(&
dst[
i * 2], _mm_slli_epi16(y16__0_7, 8));
119 _mm_storeu_si128(&
dst[
i * 2 + 1], _mm_slli_epi16(y16__8_F, 8));
// Chroma: interleave then duplicate U and V so each 4:2:2 pair shares its chroma,
// and widen to 16-bit lanes.
124 __m128i uv = _mm_unpackhi_epi32(yyyyyyyyuuuuvvvv0, yyyyyyyyuuuuvvvv8);
125 __m128i u = _mm_unpacklo_epi8(uv, uv);
126 __m128i
v = _mm_unpackhi_epi8(uv, uv);
127 __m128i u16__0_7 = _mm_unpacklo_epi8(u, zero);
128 __m128i u16__8_F = _mm_unpackhi_epi8(u, zero);
129 __m128i v16__0_7 = _mm_unpacklo_epi8(
v, zero);
130 __m128i v16__8_F = _mm_unpackhi_epi8(
v, zero);
// Integer YCbCr->RGB for pixels 0-7: c = Y-16, d = U-128, e = V-128 (each <<4),
// then r/g/b = clamp((298c + 409e | 298c - 100d - 208e | 298c + 516d) >> 8) via mulhi.
133 __m128i c16__0_7 = _mm_slli_epi16(_mm_subs_epi16(y16__0_7, _mm_set1_epi16(16)), 4);
134 __m128i d16__0_7 = _mm_slli_epi16(_mm_subs_epi16(u16__0_7, _mm_set1_epi16(128)), 4);
135 __m128i e16__0_7 = _mm_slli_epi16(_mm_subs_epi16(v16__0_7, _mm_set1_epi16(128)), 4);
136 __m128i r16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(e16__0_7, n409))))));
137 __m128i g16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n100)), _mm_mulhi_epi16(e16__0_7, n208))))));
138 __m128i b16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n516))))));
// Same conversion for pixels 8-15.
141 __m128i c16__8_F = _mm_slli_epi16(_mm_subs_epi16(y16__8_F, _mm_set1_epi16(16)), 4);
142 __m128i d16__8_F = _mm_slli_epi16(_mm_subs_epi16(u16__8_F, _mm_set1_epi16(128)), 4);
143 __m128i e16__8_F = _mm_slli_epi16(_mm_subs_epi16(v16__8_F, _mm_set1_epi16(128)), 4);
144 __m128i r16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(e16__8_F, n409))))));
145 __m128i g16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n100)), _mm_mulhi_epi16(e16__8_F, n208))))));
146 __m128i b16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n516))))));
// RGBA8 output: narrow 16-bit channels to bytes and interleave R,G,B,A (A = 0xFF).
151 __m128i rg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
152 __m128i ba8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_set1_epi8(-1));
153 __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8__0_7, ba8__0_7);
154 __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8__0_7, ba8__0_7);
156 __m128i rg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
157 __m128i ba8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_set1_epi8(-1));
158 __m128i rgba_8_B = _mm_unpacklo_epi16(rg8__8_F, ba8__8_F);
159 __m128i rgba_C_F = _mm_unpackhi_epi16(rg8__8_F, ba8__8_F);
// 16 pixels * 4 bytes = four 16-byte stores.
164 _mm_storeu_si128(&
dst[
i * 4], rgba_0_3);
165 _mm_storeu_si128(&
dst[
i * 4 + 1], rgba_4_7);
166 _mm_storeu_si128(&
dst[
i * 4 + 2], rgba_8_B);
167 _mm_storeu_si128(&
dst[
i * 4 + 3], rgba_C_F);
// RGB8 output: rotate each RGBA block so the alpha bytes collect at one end,
// then splice adjacent blocks with alignr to drop them (48 bytes out per 16 pixels).
173 __m128i rgb0 = _mm_shuffle_epi8(rgba_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
174 __m128i rgb1 = _mm_shuffle_epi8(rgba_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
175 __m128i rgb2 = _mm_shuffle_epi8(rgba_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
176 __m128i rgb3 = _mm_shuffle_epi8(rgba_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
179 _mm_storeu_si128(&
dst[
i * 3], _mm_alignr_epi8(rgb1, rgb0, 4));
180 _mm_storeu_si128(&
dst[
i * 3 + 1], _mm_alignr_epi8(rgb2, rgb1, 8));
181 _mm_storeu_si128(&
dst[
i * 3 + 2], _mm_alignr_epi8(rgb3, rgb2, 12));
// BGRA8 output: same packing as RGBA but with B and R swapped.
188 __m128i bg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
189 __m128i ra8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_set1_epi8(-1));
190 __m128i bgra_0_3 = _mm_unpacklo_epi16(bg8__0_7, ra8__0_7);
191 __m128i bgra_4_7 = _mm_unpackhi_epi16(bg8__0_7, ra8__0_7);
193 __m128i bg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
194 __m128i ra8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_set1_epi8(-1));
195 __m128i bgra_8_B = _mm_unpacklo_epi16(bg8__8_F, ra8__8_F);
196 __m128i bgra_C_F = _mm_unpackhi_epi16(bg8__8_F, ra8__8_F);
201 _mm_storeu_si128(&
dst[
i * 4], bgra_0_3);
202 _mm_storeu_si128(&
dst[
i * 4 + 1], bgra_4_7);
203 _mm_storeu_si128(&
dst[
i * 4 + 2], bgra_8_B);
204 _mm_storeu_si128(&
dst[
i * 4 + 3], bgra_C_F);
// BGR8 output: alpha-stripping splice, mirroring the RGB8 case.
210 __m128i bgr0 = _mm_shuffle_epi8(bgra_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
211 __m128i bgr1 = _mm_shuffle_epi8(bgra_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
212 __m128i bgr2 = _mm_shuffle_epi8(bgra_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
213 __m128i bgr3 = _mm_shuffle_epi8(bgra_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
216 _mm_storeu_si128(&
dst[
i * 3], _mm_alignr_epi8(bgr1, bgr0, 4));
217 _mm_storeu_si128(&
dst[
i * 3 + 1], _mm_alignr_epi8(bgr2, bgr1, 8));
218 _mm_storeu_si128(&
dst[
i * 3 + 2], _mm_alignr_epi8(bgr3, bgr2, 12));
223 #else // Generic code for when SSSE3 is not available.
// Scalar fallback: process 16 YUY2 pixels (32 source bytes) per outer iteration.
226 for (;
n;
n -= 16,
src += 32)
273 for (
int i = 0;
i < 16;
i++)
// Saturating clamp to [0,255]; `t` is a scratch temporary declared outside this extract.
280 #define clamp(x) ((t=(x)) > 255 ? 255 : t < 0 ? 0 : t)
// Integer BT.601-style YCbCr->RGB: c/d/e are the offset Y/U/V terms (declared in
// lines not visible here); +128 rounds before the >>8 fixed-point scale-down.
281 r[
i] =
clamp((298 *
c + 409 *
e + 128) >> 8);
282 g[
i] =
clamp((298 *
c - 100 *
d - 208 *
e + 128) >> 8);
283 b[
i] =
clamp((298 *
c + 516 *
d + 128) >> 8);
290 r[0],
g[0],
b[0],
r[1],
g[1],
b[1],
291 r[2],
g[2],
b[2],
r[3],
g[3],
b[3],
292 r[4],
g[4],
b[4],
r[5],
g[5],
b[5],
293 r[6],
g[6],
b[6],
r[7],
g[7],
b[7],
294 r[8],
g[8],
b[8],
r[9],
g[9],
b[9],
295 r[10],
g[10],
b[10],
r[11],
g[11],
b[11],
296 r[12],
g[12],
b[12],
r[13],
g[13],
b[13],
297 r[14],
g[14],
b[14],
r[15],
g[15],
b[15],
307 b[0],
g[0],
r[0],
b[1],
g[1],
r[1],
308 b[2],
g[2],
r[2],
b[3],
g[3],
r[3],
309 b[4],
g[4],
r[4],
b[5],
g[5],
r[5],
310 b[6],
g[6],
r[6],
b[7],
g[7],
r[7],
311 b[8],
g[8],
r[8],
b[9],
g[9],
r[9],
312 b[10],
g[10],
r[10],
b[11],
g[11],
r[11],
313 b[12],
g[12],
r[12],
b[13],
g[13],
r[13],
314 b[14],
g[14],
r[14],
b[15],
g[15],
r[15],
324 r[0],
g[0],
b[0], 255,
r[1],
g[1],
b[1], 255,
325 r[2],
g[2],
b[2], 255,
r[3],
g[3],
b[3], 255,
326 r[4],
g[4],
b[4], 255,
r[5],
g[5],
b[5], 255,
327 r[6],
g[6],
b[6], 255,
r[7],
g[7],
b[7], 255,
328 r[8],
g[8],
b[8], 255,
r[9],
g[9],
b[9], 255,
329 r[10],
g[10],
b[10], 255,
r[11],
g[11],
b[11], 255,
330 r[12],
g[12],
b[12], 255,
r[13],
g[13],
b[13], 255,
331 r[14],
g[14],
b[14], 255,
r[15],
g[15],
b[15], 255,
341 b[0],
g[0],
r[0], 255,
b[1],
g[1],
r[1], 255,
342 b[2],
g[2],
r[2], 255,
b[3],
g[3],
r[3], 255,
343 b[4],
g[4],
r[4], 255,
b[5],
g[5],
r[5], 255,
344 b[6],
g[6],
r[6], 255,
b[7],
g[7],
r[7], 255,
345 b[8],
g[8],
r[8], 255,
b[9],
g[9],
r[9], 255,
346 b[10],
g[10],
r[10], 255,
b[11],
g[11],
r[11], 255,
347 b[12],
g[12],
r[12], 255,
b[13],
g[13],
r[13], 255,
348 b[14],
g[14],
r[14], 255,
b[15],
g[15],
r[15], 255,
// Scalar M420 line parser: converts one source line (Y plane + shared UV line) into
// the requested FORMAT, 16 pixels at a time. (Signature line not visible in this extract.)
358 template<rs2_format FORMAT>
362 for (
int y_pix = 0, uv_pix = 0; y_pix <
width; y_pix += 16, uv_pix += 16)
// Copy 16 luma bytes for this chunk.
366 std::memcpy(
y, &y_one_line[y_pix], 16 );
// Duplicate each U sample (even bytes of the interleaved UV line) so every
// pixel of a 4:2:0 pair gets its own chroma value.
369 uv_one_line[uv_pix + 0], uv_one_line[uv_pix + 0], uv_one_line[uv_pix + 2], uv_one_line[uv_pix + 2],
370 uv_one_line[uv_pix + 4], uv_one_line[uv_pix + 4], uv_one_line[uv_pix + 6], uv_one_line[uv_pix + 6],
371 uv_one_line[uv_pix + 8], uv_one_line[uv_pix + 8], uv_one_line[uv_pix + 10], uv_one_line[uv_pix + 10],
372 uv_one_line[uv_pix + 12], uv_one_line[uv_pix + 12], uv_one_line[uv_pix + 14], uv_one_line[uv_pix + 14]
// Same duplication for the V samples (odd bytes).
376 uv_one_line[uv_pix + 1], uv_one_line[uv_pix + 1], uv_one_line[uv_pix + 3], uv_one_line[uv_pix + 3],
377 uv_one_line[uv_pix + 5], uv_one_line[uv_pix + 5], uv_one_line[uv_pix + 7], uv_one_line[uv_pix + 7],
378 uv_one_line[uv_pix + 9], uv_one_line[uv_pix + 9], uv_one_line[uv_pix + 11], uv_one_line[uv_pix + 11],
379 uv_one_line[uv_pix + 13], uv_one_line[uv_pix + 13], uv_one_line[uv_pix + 15], uv_one_line[uv_pix + 15]
384 for (
int i = 0;
i < 16;
i++)
// Saturating clamp to [0,255]; `t` is a scratch temporary declared outside this extract.
391 #define clamp(x) ((t=(x)) > 255 ? 255 : t < 0 ? 0 : t)
// Integer BT.601-style YCbCr->RGB, identical coefficients to the YUY2 scalar path.
392 r[
i] =
clamp((298 *
c + 409 *
e + 128) >> 8);
393 g[
i] =
clamp((298 *
c - 100 *
d - 208 *
e + 128) >> 8);
394 b[
i] =
clamp((298 *
c + 516 *
d + 128) >> 8);
402 r[0],
g[0],
b[0],
r[1],
g[1],
b[1],
403 r[2],
g[2],
b[2],
r[3],
g[3],
b[3],
404 r[4],
g[4],
b[4],
r[5],
g[5],
b[5],
405 r[6],
g[6],
b[6],
r[7],
g[7],
b[7],
406 r[8],
g[8],
b[8],
r[9],
g[9],
b[9],
407 r[10],
g[10],
b[10],
r[11],
g[11],
b[11],
408 r[12],
g[12],
b[12],
r[13],
g[13],
b[13],
409 r[14],
g[14],
b[14],
r[15],
g[15],
b[15]
419 b[0],
g[0],
r[0],
b[1],
g[1],
r[1],
420 b[2],
g[2],
r[2],
b[3],
g[3],
r[3],
421 b[4],
g[4],
r[4],
b[5],
g[5],
r[5],
422 b[6],
g[6],
r[6],
b[7],
g[7],
r[7],
423 b[8],
g[8],
r[8],
b[9],
g[9],
r[9],
424 b[10],
g[10],
r[10],
b[11],
g[11],
r[11],
425 b[12],
g[12],
r[12],
b[13],
g[13],
r[13],
426 b[14],
g[14],
r[14],
b[15],
g[15],
r[15],
436 r[0],
g[0],
b[0], 255,
r[1],
g[1],
b[1], 255,
437 r[2],
g[2],
b[2], 255,
r[3],
g[3],
b[3], 255,
438 r[4],
g[4],
b[4], 255,
r[5],
g[5],
b[5], 255,
439 r[6],
g[6],
b[6], 255,
r[7],
g[7],
b[7], 255,
440 r[8],
g[8],
b[8], 255,
r[9],
g[9],
b[9], 255,
441 r[10],
g[10],
b[10], 255,
r[11],
g[11],
b[11], 255,
442 r[12],
g[12],
b[12], 255,
r[13],
g[13],
b[13], 255,
443 r[14],
g[14],
b[14], 255,
r[15],
g[15],
b[15], 255,
453 b[0],
g[0],
r[0], 255,
b[1],
g[1],
r[1], 255,
454 b[2],
g[2],
r[2], 255,
b[3],
g[3],
r[3], 255,
455 b[4],
g[4],
r[4], 255,
b[5],
g[5],
r[5], 255,
456 b[6],
g[6],
r[6], 255,
b[7],
g[7],
r[7], 255,
457 b[8],
g[8],
r[8], 255,
b[9],
g[9],
r[9], 255,
458 b[10],
g[10],
r[10], 255,
b[11],
g[11],
r[11], 255,
459 b[12],
g[12],
r[12], 255,
b[13],
g[13],
r[13], 255,
460 b[14],
g[14],
r[14], 255,
b[15],
g[15],
r[15], 255,
469 #if defined __SSSE3__ && ! defined ANDROID
// SSSE3 M420 line parser: converts `line_length` 16-pixel chunks of one line.
// source_chunks_y  - 16 luma bytes per chunk;
// source_chunks_uv - 16 interleaved U/V bytes per chunk (shared by two Y lines);
// dst              - output vectors for the requested FORMAT.
474 template<rs2_format FORMAT>
475 void m420_sse_parse_one_line(
const __m128i* source_chunks_y,
const __m128i* source_chunks_uv, __m128i*
dst,
int line_length)
477 #pragma omp parallel for
478 for (
int i = 0;
i < line_length; ++
i)
// Widen 16 luma bytes to two vectors of 16-bit lanes.
480 const __m128i zero = _mm_set1_epi8(0);
481 __m128i y16__0_7 = _mm_unpacklo_epi8(source_chunks_y[
i], zero);
482 __m128i y16__8_F = _mm_unpackhi_epi8(source_chunks_y[
i], zero);
// Separate interleaved UV into 8 U bytes | 8 V bytes, then duplicate each
// chroma sample so both pixels of a pair share it, and widen to 16-bit.
484 const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
486 __m128i uuuuuuuuvvvvvvvv = _mm_shuffle_epi8(source_chunks_uv[
i], evens_odds);
487 __m128i u = _mm_unpacklo_epi8(uuuuuuuuvvvvvvvv, uuuuuuuuvvvvvvvv);
488 __m128i
v = _mm_unpackhi_epi8(uuuuuuuuvvvvvvvv, uuuuuuuuvvvvvvvv);
490 __m128i u16__0_7 = _mm_unpacklo_epi8(u, zero);
491 __m128i u16__8_F = _mm_unpackhi_epi8(u, zero);
492 __m128i v16__0_7 = _mm_unpacklo_epi8(
v, zero);
493 __m128i v16__8_F = _mm_unpackhi_epi8(
v, zero);
// Fixed-point constants pre-shifted left 4 so mulhi(x<<4, k<<4) == (x*k) >> 8.
495 const __m128i n100 = _mm_set1_epi16(100 << 4);
496 const __m128i n208 = _mm_set1_epi16(208 << 4);
497 const __m128i n298 = _mm_set1_epi16(298 << 4);
498 const __m128i n409 = _mm_set1_epi16(409 << 4);
499 const __m128i n516 = _mm_set1_epi16(516 << 4);
// Integer YCbCr->RGB for pixels 0-7 (c = Y-16, d = U-128, e = V-128, each <<4).
501 __m128i c16__0_7 = _mm_slli_epi16(_mm_subs_epi16(y16__0_7, _mm_set1_epi16(16)), 4);
502 __m128i d16__0_7 = _mm_slli_epi16(_mm_subs_epi16(u16__0_7, _mm_set1_epi16(128)), 4);
503 __m128i e16__0_7 = _mm_slli_epi16(_mm_subs_epi16(v16__0_7, _mm_set1_epi16(128)), 4);
504 __m128i r16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(e16__0_7, n409))))));
505 __m128i g16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n100)), _mm_mulhi_epi16(e16__0_7, n208))))));
506 __m128i b16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n516))))));
// Same conversion for pixels 8-15.
509 __m128i c16__8_F = _mm_slli_epi16(_mm_subs_epi16(y16__8_F, _mm_set1_epi16(16)), 4);
510 __m128i d16__8_F = _mm_slli_epi16(_mm_subs_epi16(u16__8_F, _mm_set1_epi16(128)), 4);
511 __m128i e16__8_F = _mm_slli_epi16(_mm_subs_epi16(v16__8_F, _mm_set1_epi16(128)), 4);
512 __m128i r16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(e16__8_F, n409))))));
513 __m128i g16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n100)), _mm_mulhi_epi16(e16__8_F, n208))))));
514 __m128i b16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n516))))));
// RGBA8 output (case label not visible): interleave R,G,B,A with A = 0xFF.
521 __m128i rg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
522 __m128i ba8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_set1_epi8(-1));
523 __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8__0_7, ba8__0_7);
524 __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8__0_7, ba8__0_7);
526 __m128i rg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
527 __m128i ba8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_set1_epi8(-1));
528 __m128i rgba_8_B = _mm_unpacklo_epi16(rg8__8_F, ba8__8_F);
529 __m128i rgba_C_F = _mm_unpackhi_epi16(rg8__8_F, ba8__8_F);
534 _mm_storeu_si128(&
dst[
i * 4], rgba_0_3);
535 _mm_storeu_si128(&
dst[
i * 4 + 1], rgba_4_7);
536 _mm_storeu_si128(&
dst[
i * 4 + 2], rgba_8_B);
537 _mm_storeu_si128(&
dst[
i * 4 + 3], rgba_C_F);
// RGB8 output: rotate alpha bytes to one end of each block, then splice
// adjacent blocks with alignr to drop them (48 bytes per 16 pixels).
545 __m128i rgb0 = _mm_shuffle_epi8(rgba_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
546 __m128i rgb1 = _mm_shuffle_epi8(rgba_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
547 __m128i rgb2 = _mm_shuffle_epi8(rgba_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
548 __m128i rgb3 = _mm_shuffle_epi8(rgba_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
551 _mm_storeu_si128(&
dst[
i * 3], _mm_alignr_epi8(rgb1, rgb0, 4));
552 _mm_storeu_si128(&
dst[
i * 3 + 1], _mm_alignr_epi8(rgb2, rgb1, 8));
553 _mm_storeu_si128(&
dst[
i * 3 + 2], _mm_alignr_epi8(rgb3, rgb2, 12));
// BGRA8 output: same packing as RGBA with B and R swapped.
562 __m128i bg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
563 __m128i ra8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_set1_epi8(-1));
564 __m128i bgra_0_3 = _mm_unpacklo_epi16(bg8__0_7, ra8__0_7);
565 __m128i bgra_4_7 = _mm_unpackhi_epi16(bg8__0_7, ra8__0_7);
567 __m128i bg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
568 __m128i ra8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_set1_epi8(-1));
569 __m128i bgra_8_B = _mm_unpacklo_epi16(bg8__8_F, ra8__8_F);
570 __m128i bgra_C_F = _mm_unpackhi_epi16(bg8__8_F, ra8__8_F);
575 _mm_storeu_si128(&
dst[
i * 4], bgra_0_3);
576 _mm_storeu_si128(&
dst[
i * 4 + 1], bgra_4_7);
577 _mm_storeu_si128(&
dst[
i * 4 + 2], bgra_8_B);
578 _mm_storeu_si128(&
dst[
i * 4 + 3], bgra_C_F);
// BGR8 output: alpha-stripping splice, mirroring the RGB8 case.
586 __m128i bgr0 = _mm_shuffle_epi8(bgra_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
587 __m128i bgr1 = _mm_shuffle_epi8(bgra_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
588 __m128i bgr2 = _mm_shuffle_epi8(bgra_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
589 __m128i bgr3 = _mm_shuffle_epi8(bgra_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
592 _mm_storeu_si128(&
dst[
i * 3], _mm_alignr_epi8(bgr1, bgr0, 4));
593 _mm_storeu_si128(&
dst[
i * 3 + 1], _mm_alignr_epi8(bgr2, bgr1, 8));
594 _mm_storeu_si128(&
dst[
i * 3 + 2], _mm_alignr_epi8(bgr3, bgr2, 12));
623 #if defined __SSSE3__ && ! defined ANDROID
// NOTE(review): do_avx is initialized once but its use is not visible in this extract.
624 static bool do_avx =
has_avx();
// Treat source/destination as streams of 16-byte SSE vectors.
// NOTE(review): d[0] is assumed to be the single destination plane — confirm against caller.
626 auto src =
reinterpret_cast<const __m128i*
>(
s);
627 auto dst =
reinterpret_cast<__m128i*
>(
d[0]);
// Scratch buffers: two Y lines (2*width bytes) and one shared UV line (width bytes),
// in 16-byte chunks. Freed at the end of the function (see delete[] below).
629 __m128i* source_chunks_y =
new __m128i[2 *
width / 16];
630 __m128i* source_chunks_uv =
new __m128i[
width / 16];
632 #pragma omp parallel for
635 #pragma omp parallel for
// Load the two Y lines of the current M420 group (j indexes the group; each
// source group is 3*width bytes: 2 Y lines + 1 UV line).
636 for (
int i = 0;
i < 2 *
width / 16; ++
i)
638 auto offset_to_current_2_y_lines_for_src = (3 *
width *
j) / 16;
640 source_chunks_y[
i] = _mm_loadu_si128(&
src[offset_to_current_2_y_lines_for_src +
i]);
// Y8 output: luma bytes are copied through unchanged.
644 auto offset_to_current_2_y_lines_for_dst = (2 *
width *
j) / 16;
646 _mm_storeu_si128(&
dst[offset_to_current_2_y_lines_for_dst +
i], source_chunks_y[
i]);
// Y16 output: widen each luma byte to 16 bits and shift into the high byte.
653 auto offset_to_current_2_y_lines_for_dst = (2 *
width *
j) / 16 *
bpp;
654 const __m128i zero = _mm_set1_epi8(0);
655 __m128i y16__0_7 = _mm_unpacklo_epi8(source_chunks_y[
i], zero);
656 __m128i y16__8_F = _mm_unpackhi_epi8(source_chunks_y[
i], zero);
657 __m128i y16_0_7_epi_16 = _mm_slli_epi16(y16__0_7, 8);
658 __m128i y16_8_F_epi_16 = _mm_slli_epi16(y16__8_F, 8);
660 _mm_storeu_si128(&
dst[offset_to_current_2_y_lines_for_dst +
i * 2], y16_0_7_epi_16);
661 _mm_storeu_si128(&
dst[offset_to_current_2_y_lines_for_dst +
i * 2 + 1], y16_8_F_epi_16);
// The UV line of the current M420 group sits right after its two Y lines.
665 auto offset_to_current_uv_line_for_src = offset_to_current_2_y_lines_for_src + 2 *
width / 16;
// Use an unaligned load: `src` is reinterpret_cast from an arbitrary frame byte
// buffer with no 16-byte alignment guarantee, and every sibling load in this
// function (e.g. the Y chunks above) already uses _mm_loadu_si128. The previous
// aligned _mm_load_si128 would fault (GP exception) on a misaligned source.
667 source_chunks_uv[
i] = _mm_loadu_si128(&
src[offset_to_current_uv_line_for_src +
i]);
// RGB-family outputs: each source group produces two destination lines of
// width*bpp bytes; both lines share the same UV chunks.
676 auto offset_to_current_first_line_for_dst = (2 *
width *
j) / 16 *
bpp;
677 auto offset_to_current_second_line_for_dst = offset_to_current_first_line_for_dst +
width *
bpp / 16;
679 auto line_length =
width / 16;
680 auto first_line_y = source_chunks_y;
681 auto second_line_y = source_chunks_y + line_length;
// Convert both Y lines against the shared UV line.
683 m420_sse_parse_one_line<FORMAT>(first_line_y, source_chunks_uv, &
dst[offset_to_current_first_line_for_dst], line_length);
684 m420_sse_parse_one_line<FORMAT>(second_line_y, source_chunks_uv, &
dst[offset_to_current_second_line_for_dst], line_length);
// Release the per-call scratch chunk buffers allocated above.
688 delete[] source_chunks_y;
689 delete[] source_chunks_uv;
// Scalar M420 fallback. M420 stores groups of 3 source lines (2 Y + 1 UV) per
// 2 output lines, hence the source height is height * 3/2 (computed as *12>>3).
695 auto src_height =
height * 12 >> 3;
// Y8 output: copy the two luma lines of each group straight through.
699 for (
int k = 0; k < src_height; k += 3)
704 std::memcpy(
dst, start_of_y, 2 *
width );
// Y16 output: widen each luma byte into the high byte of a 16-bit sample.
711 for (
int k = 0; k < src_height; k += 3)
717 for (
int pix = 0; pix < 2 *
width; pix += 16)
720 for (
int dst_idx = 0, src_idx = 0; dst_idx < 16; dst_idx += 1, ++src_idx)
722 y[dst_idx] = start_of_y[src_idx + pix] << 8;
724 std::memcpy(
dst,
y,
sizeof y );
// RGB-family outputs: locate the two Y lines and the shared UV line of each
// group, then convert each Y line with the scalar line parser.
731 for (
int k = 0; k < src_height; k += 3)
736 auto start_of_second_line = start_of_y +
width;
737 auto end_of_y = start_of_second_line +
width;
738 auto start_of_uv = end_of_y;
739 auto end_of_uv = start_of_uv +
width;
741 m420_parse_one_line<FORMAT>(start_of_y, start_of_uv, &
dst,
width);
742 m420_parse_one_line<FORMAT>(start_of_second_line, start_of_uv, &
dst,
width);
// Runtime dispatch from the requested rs2_format to the templated YUY2 unpacker
// (the surrounding switch/case labels are not visible in this extract).
753 unpack_yuy2<RS2_FORMAT_RGB8>(
d,
s,
w,
h, actual_size);
756 unpack_yuy2<RS2_FORMAT_Y8>(
d,
s,
w,
h, actual_size);
759 unpack_yuy2<RS2_FORMAT_RGBA8>(
d,
s,
w,
h, actual_size);
762 unpack_yuy2<RS2_FORMAT_BGR8>(
d,
s,
w,
h, actual_size);
765 unpack_yuy2<RS2_FORMAT_BGRA8>(
d,
s,
w,
h, actual_size);
768 unpack_yuy2<RS2_FORMAT_Y16>(
d,
s,
w,
h, actual_size );
// Any other target format is unsupported — log and fall through.
771 LOG_ERROR(
"Unsupported format for YUY2 conversion.");
// Runtime dispatch from the requested rs2_format to the templated M420 unpacker
// (the surrounding switch/case labels are not visible in this extract).
782 unpack_m420<RS2_FORMAT_Y8>(
d,
s,
w,
h, actual_size);
785 unpack_m420<RS2_FORMAT_Y16>(
d,
s,
w,
h, actual_size);
788 unpack_m420<RS2_FORMAT_RGB8>(
d,
s,
w,
h, actual_size);
791 unpack_m420<RS2_FORMAT_RGBA8>(
d,
s,
w,
h, actual_size);
794 unpack_m420<RS2_FORMAT_BGR8>(
d,
s,
w,
h, actual_size);
797 unpack_m420<RS2_FORMAT_BGRA8>(
d,
s,
w,
h, actual_size);
// Any other target format is unsupported — log and fall through.
800 LOG_ERROR(
"Unsupported format for M420 conversion.");
// SSSE3 UYVY unpacker body. UYVY stores pixels as U Y0 V Y1 (chroma first),
// which is why the regrouping shuffle below differs from the YUY2 one.
// NOTE(review): d[0] is assumed to be the single destination plane — confirm against caller.
815 auto src =
reinterpret_cast<const __m128i *
>(
s);
816 auto dst =
reinterpret_cast<__m128i *
>(
d[0]);
// Fixed-point constants pre-shifted left 4 so mulhi(x<<4, k<<4) == (x*k) >> 8.
819 const __m128i zero = _mm_set1_epi8(0);
820 const __m128i n100 = _mm_set1_epi16(100 << 4);
821 const __m128i n208 = _mm_set1_epi16(208 << 4);
822 const __m128i n298 = _mm_set1_epi16(298 << 4);
823 const __m128i n409 = _mm_set1_epi16(409 << 4);
824 const __m128i n516 = _mm_set1_epi16(516 << 4);
825 const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
// Load two 16-byte chunks = 16 UYVY pixels.
828 __m128i s0 = _mm_loadu_si128(
src++);
829 __m128i
s1 = _mm_loadu_si128(
src++);
// Regroup each chunk as 8 Y bytes | 4 U bytes | 4 V bytes. Luma sits at odd
// byte positions in UYVY, hence the odd indices lead this mask.
833 const __m128i evens_odd1s_odd3s = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 4, 8, 12, 2, 6, 10, 14);
834 __m128i yyyyyyyyuuuuvvvv0 = _mm_shuffle_epi8(s0, evens_odd1s_odd3s);
835 __m128i yyyyyyyyuuuuvvvv8 = _mm_shuffle_epi8(
s1, evens_odd1s_odd3s);
// Widen luma bytes to 16-bit lanes.
838 __m128i y16__0_7 = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv0, zero);
839 __m128i y16__8_F = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv8, zero);
// Duplicate each chroma sample so both pixels of a 4:2:2 pair share it,
// then widen to 16-bit lanes.
843 __m128i uv = _mm_unpackhi_epi32(yyyyyyyyuuuuvvvv0, yyyyyyyyuuuuvvvv8);
844 __m128i u = _mm_unpacklo_epi8(uv, uv);
845 __m128i
v = _mm_unpackhi_epi8(uv, uv);
846 __m128i u16__0_7 = _mm_unpacklo_epi8(u, zero);
847 __m128i u16__8_F = _mm_unpackhi_epi8(u, zero);
848 __m128i v16__0_7 = _mm_unpacklo_epi8(
v, zero);
849 __m128i v16__8_F = _mm_unpackhi_epi8(
v, zero);
// Integer YCbCr->RGB for pixels 0-7 (c = Y-16, d = U-128, e = V-128, each <<4).
852 __m128i c16__0_7 = _mm_slli_epi16(_mm_subs_epi16(y16__0_7, _mm_set1_epi16(16)), 4);
853 __m128i d16__0_7 = _mm_slli_epi16(_mm_subs_epi16(u16__0_7, _mm_set1_epi16(128)), 4);
854 __m128i e16__0_7 = _mm_slli_epi16(_mm_subs_epi16(v16__0_7, _mm_set1_epi16(128)), 4);
855 __m128i r16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(e16__0_7, n409))))));
856 __m128i g16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n100)), _mm_mulhi_epi16(e16__0_7, n208))))));
857 __m128i b16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n516))))));
// Same conversion for pixels 8-15.
860 __m128i c16__8_F = _mm_slli_epi16(_mm_subs_epi16(y16__8_F, _mm_set1_epi16(16)), 4);
861 __m128i d16__8_F = _mm_slli_epi16(_mm_subs_epi16(u16__8_F, _mm_set1_epi16(128)), 4);
862 __m128i e16__8_F = _mm_slli_epi16(_mm_subs_epi16(v16__8_F, _mm_set1_epi16(128)), 4);
863 __m128i r16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(e16__8_F, n409))))));
864 __m128i g16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n100)), _mm_mulhi_epi16(e16__8_F, n208))))));
865 __m128i b16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n516))))));
// RGBA8 output (case label not visible): interleave R,G,B,A with A = 0xFF.
870 __m128i rg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
871 __m128i ba8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_set1_epi8(-1));
872 __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8__0_7, ba8__0_7);
873 __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8__0_7, ba8__0_7);
875 __m128i rg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
876 __m128i ba8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_set1_epi8(-1));
877 __m128i rgba_8_B = _mm_unpacklo_epi16(rg8__8_F, ba8__8_F);
878 __m128i rgba_C_F = _mm_unpackhi_epi16(rg8__8_F, ba8__8_F);
883 _mm_storeu_si128(
dst++, rgba_0_3);
884 _mm_storeu_si128(
dst++, rgba_4_7);
885 _mm_storeu_si128(
dst++, rgba_8_B);
886 _mm_storeu_si128(
dst++, rgba_C_F);
// RGB8 output: rotate alpha bytes to one end of each block, then splice
// adjacent blocks with alignr to drop them (48 bytes per 16 pixels).
892 __m128i rgb0 = _mm_shuffle_epi8(rgba_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
893 __m128i rgb1 = _mm_shuffle_epi8(rgba_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
894 __m128i rgb2 = _mm_shuffle_epi8(rgba_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
895 __m128i rgb3 = _mm_shuffle_epi8(rgba_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
898 _mm_storeu_si128(
dst++, _mm_alignr_epi8(rgb1, rgb0, 4));
899 _mm_storeu_si128(
dst++, _mm_alignr_epi8(rgb2, rgb1, 8));
900 _mm_storeu_si128(
dst++, _mm_alignr_epi8(rgb3, rgb2, 12));
// BGRA8 output: same packing as RGBA with B and R swapped.
907 __m128i bg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds));
908 __m128i ra8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_set1_epi8(-1));
909 __m128i bgra_0_3 = _mm_unpacklo_epi16(bg8__0_7, ra8__0_7);
910 __m128i bgra_4_7 = _mm_unpackhi_epi16(bg8__0_7, ra8__0_7);
912 __m128i bg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds));
913 __m128i ra8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_set1_epi8(-1));
914 __m128i bgra_8_B = _mm_unpacklo_epi16(bg8__8_F, ra8__8_F);
915 __m128i bgra_C_F = _mm_unpackhi_epi16(bg8__8_F, ra8__8_F);
920 _mm_storeu_si128(
dst++, bgra_0_3);
921 _mm_storeu_si128(
dst++, bgra_4_7);
922 _mm_storeu_si128(
dst++, bgra_8_B);
923 _mm_storeu_si128(
dst++, bgra_C_F);
// BGR8 output: alpha-stripping splice, mirroring the RGB8 case.
929 __m128i bgr0 = _mm_shuffle_epi8(bgra_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
930 __m128i bgr1 = _mm_shuffle_epi8(bgra_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
931 __m128i bgr2 = _mm_shuffle_epi8(bgra_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
932 __m128i bgr3 = _mm_shuffle_epi8(bgra_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
935 _mm_storeu_si128(
dst++, _mm_alignr_epi8(bgr1, bgr0, 4));
936 _mm_storeu_si128(
dst++, _mm_alignr_epi8(bgr2, bgr1, 8));
937 _mm_storeu_si128(
dst++, _mm_alignr_epi8(bgr3, bgr2, 12));
941 #else // Generic code for when SSSE3 is not available.
// Scalar fallback: process 16 UYVY pixels (32 source bytes) per outer iteration.
944 for (;
n;
n -= 16,
src += 32)
964 for (
int i = 0;
i < 16;
i++)
// Saturating clamp to [0,255]; `t` is a scratch temporary declared outside this extract.
971 #define clamp(x) ((t=(x)) > 255 ? 255 : t < 0 ? 0 : t)
// Integer BT.601-style YCbCr->RGB, identical coefficients to the YUY2 scalar path.
972 r[
i] =
clamp((298 *
c + 409 *
e + 128) >> 8);
973 g[
i] =
clamp((298 *
c - 100 *
d - 208 *
e + 128) >> 8);
974 b[
i] =
clamp((298 *
c + 516 *
d + 128) >> 8);
981 r[0],
g[0],
b[0],
r[1],
g[1],
b[1],
982 r[2],
g[2],
b[2],
r[3],
g[3],
b[3],
983 r[4],
g[4],
b[4],
r[5],
g[5],
b[5],
984 r[6],
g[6],
b[6],
r[7],
g[7],
b[7],
985 r[8],
g[8],
b[8],
r[9],
g[9],
b[9],
986 r[10],
g[10],
b[10],
r[11],
g[11],
b[11],
987 r[12],
g[12],
b[12],
r[13],
g[13],
b[13],
988 r[14],
g[14],
b[14],
r[15],
g[15],
b[15],
998 b[0],
g[0],
r[0],
b[1],
g[1],
r[1],
999 b[2],
g[2],
r[2],
b[3],
g[3],
r[3],
1000 b[4],
g[4],
r[4],
b[5],
g[5],
r[5],
1001 b[6],
g[6],
r[6],
b[7],
g[7],
r[7],
1002 b[8],
g[8],
r[8],
b[9],
g[9],
r[9],
1003 b[10],
g[10],
r[10],
b[11],
g[11],
r[11],
1004 b[12],
g[12],
r[12],
b[13],
g[13],
r[13],
1005 b[14],
g[14],
r[14],
b[15],
g[15],
r[15],
1015 r[0],
g[0],
b[0], 255,
r[1],
g[1],
b[1], 255,
1016 r[2],
g[2],
b[2], 255,
r[3],
g[3],
b[3], 255,
1017 r[4],
g[4],
b[4], 255,
r[5],
g[5],
b[5], 255,
1018 r[6],
g[6],
b[6], 255,
r[7],
g[7],
b[7], 255,
1019 r[8],
g[8],
b[8], 255,
r[9],
g[9],
b[9], 255,
1020 r[10],
g[10],
b[10], 255,
r[11],
g[11],
b[11], 255,
1021 r[12],
g[12],
b[12], 255,
r[13],
g[13],
b[13], 255,
1022 r[14],
g[14],
b[14], 255,
r[15],
g[15],
b[15], 255,
1032 b[0],
g[0],
r[0], 255,
b[1],
g[1],
r[1], 255,
1033 b[2],
g[2],
r[2], 255,
b[3],
g[3],
r[3], 255,
1034 b[4],
g[4],
r[4], 255,
b[5],
g[5],
r[5], 255,
1035 b[6],
g[6],
r[6], 255,
b[7],
g[7],
r[7], 255,
1036 b[8],
g[8],
r[8], 255,
b[9],
g[9],
r[9], 255,
1037 b[10],
g[10],
r[10], 255,
b[11],
g[11],
r[11], 255,
1038 b[12],
g[12],
r[12], 255,
b[13],
g[13],
r[13], 255,
1039 b[14],
g[14],
r[14], 255,
b[15],
g[15],
r[15], 255,
// Runtime dispatch from the requested rs2_format to the templated UYVY unpacker
// (the surrounding switch/case labels are not visible in this extract).
1054 unpack_uyvy<RS2_FORMAT_RGB8>(
d,
s,
w,
h, actual_size);
1057 unpack_uyvy<RS2_FORMAT_RGBA8>(
d,
s,
w,
h, actual_size);
1060 unpack_uyvy<RS2_FORMAT_BGR8>(
d,
s,
w,
h, actual_size);
1063 unpack_uyvy<RS2_FORMAT_BGRA8>(
d,
s,
w,
h, actual_size);
// Any other target format is unsupported — log and fall through.
1066 LOG_ERROR(
"Unsupported format for UYVY conversion.");
// Decompressed-image handling fragment (enclosing function not fully visible):
// if decoding produced a buffer, copy w*h*bpp bytes to the destination plane.
// NOTE(review): presumably uncompressed_rgb comes from stb_image (included above) — confirm.
1078 if (uncompressed_rgb)
1080 auto uncompressed_size =
w *
h *
bpp;
1081 std::memcpy(
dest[0], uncompressed_rgb, uncompressed_size );
// Swap the R and B bytes of each 3-byte pixel in place (RGB <-> BGR reorder).
1100 std::swap(
out[
i * 3],
out[
i * 3 + 2]);