color-formats-converter.cpp
// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2019 Intel Corporation. All Rights Reserved.

#include "color-formats-converter.h"

#include "option.h"
#include "image-avx.h"
#include "image.h"

#define STB_IMAGE_STATIC
#define STB_IMAGE_IMPLEMENTATION
#include "../third-party/stb_image.h"

#ifdef RS2_USE_CUDA
#include "cuda/cuda-conversion.cuh"
#endif
#ifdef __SSSE3__
#include <tmmintrin.h> // For SSSE3 intrinsics
#endif

#if defined (ANDROID) || (defined (__linux__) && !defined (__x86_64__)) || (defined (__APPLE__) && !defined (__x86_64__))

bool has_avx() { return false; }

#else

#ifdef _WIN32
#include <intrin.h>
#define cpuid(info, x) __cpuidex(info, x, 0)
#else
#include <cpuid.h>
void cpuid(int info[4], int info_type) {
    __cpuid_count(info_type, 0, info[0], info[1], info[2], info[3]);
}
#endif

bool has_avx()
{
    int info[4];
    cpuid(info, 0);
    cpuid(info, 1); // AVX support is reported in ECX bit 28 of CPUID leaf 1
    return (info[2] & ((int)1 << 28)) != 0;
}

#endif

namespace librealsense
{
    /////////////////////////////
    // YUY2 unpacking routines //
    /////////////////////////////
    // This templated function unpacks YUY2 into Y8/Y16/RGB8/RGBA8/BGR8/BGRA8, depending on the compile-time parameter FORMAT.
    // It is expected that all branching outside of the loop control variable will be removed due to constant-folding.
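    // YUY2 packs two pixels into four bytes, laid out Y0 U Y1 V: each pixel has its own luma
    // sample, while each horizontal pair shares one U and one V chroma sample. The even/odd
    // byte shuffles below separate the luma stream from the interleaved chroma.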
    template<rs2_format FORMAT> void unpack_yuy2(byte * const d[], const byte * s, int width, int height, int actual_size)
    {
        auto n = width * height;
        assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
#ifdef RS2_USE_CUDA
        rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
        return;
#endif
#if defined __SSSE3__ && ! defined ANDROID
        static bool do_avx = has_avx();
#ifdef __AVX2__

        if (do_avx)
        {
            if (FORMAT == RS2_FORMAT_Y8) unpack_yuy2_avx_y8(d, s, n);
            if (FORMAT == RS2_FORMAT_Y16) unpack_yuy2_avx_y16(d, s, n);
            if (FORMAT == RS2_FORMAT_RGB8) unpack_yuy2_avx_rgb8(d, s, n);
            if (FORMAT == RS2_FORMAT_RGBA8) unpack_yuy2_avx_rgba8(d, s, n);
            if (FORMAT == RS2_FORMAT_BGR8) unpack_yuy2_avx_bgr8(d, s, n);
            if (FORMAT == RS2_FORMAT_BGRA8) unpack_yuy2_avx_bgra8(d, s, n);
        }
        else
#endif
        {
            auto src = reinterpret_cast<const __m128i *>(s);
            auto dst = reinterpret_cast<__m128i *>(d[0]);

#pragma omp parallel for
            for (int i = 0; i < n / 16; i++)
            {
                const __m128i zero = _mm_set1_epi8(0);
                const __m128i n100 = _mm_set1_epi16(100 << 4);
                const __m128i n208 = _mm_set1_epi16(208 << 4);
                const __m128i n298 = _mm_set1_epi16(298 << 4);
                const __m128i n409 = _mm_set1_epi16(409 << 4);
                const __m128i n516 = _mm_set1_epi16(516 << 4);
                const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
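                // The constants above are the usual fixed-point BT.601 YCbCr-to-RGB coefficients
                // (see the scalar fallback below, e.g. R = clamp((298 * c + 409 * e + 128) >> 8)).
                // Both the coefficients and the c/d/e inputs are pre-shifted left by 4, so that
                // _mm_mulhi_epi16, which keeps the high 16 bits of the 32-bit product, yields
                // roughly (coef * value) >> 8, since ((a << 4) * (b << 4)) >> 16 == (a * b) >> 8.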

                // Load 8 YUY2 pixels each into two 16-byte registers
                __m128i s0 = _mm_loadu_si128(&src[i * 2]);
                __m128i s1 = _mm_loadu_si128(&src[i * 2 + 1]);

                if (FORMAT == RS2_FORMAT_Y8)
                {
                    // Align all Y components and output 16 pixels (16 bytes) at once
                    __m128i y0 = _mm_shuffle_epi8(s0, _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14));
                    __m128i y1 = _mm_shuffle_epi8(s1, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
                    _mm_storeu_si128(&dst[i], _mm_alignr_epi8(y0, y1, 8));
                    continue;
                }

                // Shuffle all Y components to the low order bytes of the register, and all U/V components to the high order bytes
                const __m128i evens_odd1s_odd3s = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 5, 9, 13, 3, 7, 11, 15); // to get yyyyyyyyuuuuvvvv
                __m128i yyyyyyyyuuuuvvvv0 = _mm_shuffle_epi8(s0, evens_odd1s_odd3s);
                __m128i yyyyyyyyuuuuvvvv8 = _mm_shuffle_epi8(s1, evens_odd1s_odd3s);

                // Retrieve all 16 Y components as 16-bit values (8 components per register)
                __m128i y16__0_7 = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv0, zero); // convert to 16 bit
                __m128i y16__8_F = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv8, zero); // convert to 16 bit

                if (FORMAT == RS2_FORMAT_Y16)
                {
                    // Output 16 pixels (32 bytes) at once
                    _mm_storeu_si128(&dst[i * 2], _mm_slli_epi16(y16__0_7, 8));
                    _mm_storeu_si128(&dst[i * 2 + 1], _mm_slli_epi16(y16__8_F, 8));
                    continue;
                }

                // Retrieve all 16 U and V components as 16-bit values (8 components per register)
                __m128i uv = _mm_unpackhi_epi32(yyyyyyyyuuuuvvvv0, yyyyyyyyuuuuvvvv8); // uuuuuuuuvvvvvvvv
                __m128i u = _mm_unpacklo_epi8(uv, uv); // uu uu uu uu uu uu uu uu u's duplicated
                __m128i v = _mm_unpackhi_epi8(uv, uv); // vv vv vv vv vv vv vv vv
                __m128i u16__0_7 = _mm_unpacklo_epi8(u, zero); // convert to 16 bit
                __m128i u16__8_F = _mm_unpackhi_epi8(u, zero); // convert to 16 bit
                __m128i v16__0_7 = _mm_unpacklo_epi8(v, zero); // convert to 16 bit
                __m128i v16__8_F = _mm_unpackhi_epi8(v, zero); // convert to 16 bit

                // Compute R, G, B values for first 8 pixels
                __m128i c16__0_7 = _mm_slli_epi16(_mm_subs_epi16(y16__0_7, _mm_set1_epi16(16)), 4);
                __m128i d16__0_7 = _mm_slli_epi16(_mm_subs_epi16(u16__0_7, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
                __m128i e16__0_7 = _mm_slli_epi16(_mm_subs_epi16(v16__0_7, _mm_set1_epi16(128)), 4);
                __m128i r16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(e16__0_7, n409)))))); // (298 * c + 409 * e + 128) ; //
                __m128i g16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n100)), _mm_mulhi_epi16(e16__0_7, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
                __m128i b16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8);

                // Compute R, G, B values for second 8 pixels
                __m128i c16__8_F = _mm_slli_epi16(_mm_subs_epi16(y16__8_F, _mm_set1_epi16(16)), 4);
                __m128i d16__8_F = _mm_slli_epi16(_mm_subs_epi16(u16__8_F, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
                __m128i e16__8_F = _mm_slli_epi16(_mm_subs_epi16(v16__8_F, _mm_set1_epi16(128)), 4);
                __m128i r16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(e16__8_F, n409)))))); // (298 * c + 409 * e + 128) ; //
                __m128i g16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n100)), _mm_mulhi_epi16(e16__8_F, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
                __m128i b16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8);

                if (FORMAT == RS2_FORMAT_RGBA8 || FORMAT == RS2_FORMAT_RGB8)
                {
                    // Shuffle separate R, G, B values into four registers storing four pixels each in (R, G, B, A) order
                    __m128i rg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds)); // hi to take the odds which are the upper bytes we care about
                    __m128i ba8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_set1_epi8(-1));
                    __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8__0_7, ba8__0_7);
                    __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8__0_7, ba8__0_7);

                    __m128i rg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds)); // hi to take the odds which are the upper bytes we care about
                    __m128i ba8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_set1_epi8(-1));
                    __m128i rgba_8_B = _mm_unpacklo_epi16(rg8__8_F, ba8__8_F);
                    __m128i rgba_C_F = _mm_unpackhi_epi16(rg8__8_F, ba8__8_F);

                    if (FORMAT == RS2_FORMAT_RGBA8)
                    {
                        // Store 16 pixels (64 bytes) at once
                        _mm_storeu_si128(&dst[i * 4], rgba_0_3);
                        _mm_storeu_si128(&dst[i * 4 + 1], rgba_4_7);
                        _mm_storeu_si128(&dst[i * 4 + 2], rgba_8_B);
                        _mm_storeu_si128(&dst[i * 4 + 3], rgba_C_F);
                    }

                    if (FORMAT == RS2_FORMAT_RGB8)
                    {
                        // Shuffle rgb triples to the start and end of each register
                        __m128i rgb0 = _mm_shuffle_epi8(rgba_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                        __m128i rgb1 = _mm_shuffle_epi8(rgba_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                        __m128i rgb2 = _mm_shuffle_epi8(rgba_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                        __m128i rgb3 = _mm_shuffle_epi8(rgba_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                        // Align registers and store 16 pixels (48 bytes) at once
                        _mm_storeu_si128(&dst[i * 3], _mm_alignr_epi8(rgb1, rgb0, 4));
                        _mm_storeu_si128(&dst[i * 3 + 1], _mm_alignr_epi8(rgb2, rgb1, 8));
                        _mm_storeu_si128(&dst[i * 3 + 2], _mm_alignr_epi8(rgb3, rgb2, 12));
                    }
                }

                if (FORMAT == RS2_FORMAT_BGRA8 || FORMAT == RS2_FORMAT_BGR8)
                {
                    // Shuffle separate R, G, B values into four registers storing four pixels each in (B, G, R, A) order
                    __m128i bg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds)); // hi to take the odds which are the upper bytes we care about
                    __m128i ra8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_set1_epi8(-1));
                    __m128i bgra_0_3 = _mm_unpacklo_epi16(bg8__0_7, ra8__0_7);
                    __m128i bgra_4_7 = _mm_unpackhi_epi16(bg8__0_7, ra8__0_7);

                    __m128i bg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds)); // hi to take the odds which are the upper bytes we care about
                    __m128i ra8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_set1_epi8(-1));
                    __m128i bgra_8_B = _mm_unpacklo_epi16(bg8__8_F, ra8__8_F);
                    __m128i bgra_C_F = _mm_unpackhi_epi16(bg8__8_F, ra8__8_F);

                    if (FORMAT == RS2_FORMAT_BGRA8)
                    {
                        // Store 16 pixels (64 bytes) at once
                        _mm_storeu_si128(&dst[i * 4], bgra_0_3);
                        _mm_storeu_si128(&dst[i * 4 + 1], bgra_4_7);
                        _mm_storeu_si128(&dst[i * 4 + 2], bgra_8_B);
                        _mm_storeu_si128(&dst[i * 4 + 3], bgra_C_F);
                    }

                    if (FORMAT == RS2_FORMAT_BGR8)
                    {
                        // Shuffle rgb triples to the start and end of each register
                        __m128i bgr0 = _mm_shuffle_epi8(bgra_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                        __m128i bgr1 = _mm_shuffle_epi8(bgra_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                        __m128i bgr2 = _mm_shuffle_epi8(bgra_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                        __m128i bgr3 = _mm_shuffle_epi8(bgra_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                        // Align registers and store 16 pixels (48 bytes) at once
                        _mm_storeu_si128(&dst[i * 3], _mm_alignr_epi8(bgr1, bgr0, 4));
                        _mm_storeu_si128(&dst[i * 3 + 1], _mm_alignr_epi8(bgr2, bgr1, 8));
                        _mm_storeu_si128(&dst[i * 3 + 2], _mm_alignr_epi8(bgr3, bgr2, 12));
                    }
                }
            }
        }
#else // Generic code for when SSSE3 is not available.
        auto src = reinterpret_cast<const uint8_t *>(s);
        auto dst = reinterpret_cast<uint8_t *>(d[0]);
        for (; n; n -= 16, src += 32)
        {
            if (FORMAT == RS2_FORMAT_Y8)
            {
                uint8_t out[16] = {
                    src[0], src[2], src[4], src[6],
                    src[8], src[10], src[12], src[14],
                    src[16], src[18], src[20], src[22],
                    src[24], src[26], src[28], src[30],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_Y16)
            {
                // Y16 is little-endian. We output Y << 8.
                uint8_t out[32] = {
                    0, src[0], 0, src[2], 0, src[4], 0, src[6],
                    0, src[8], 0, src[10], 0, src[12], 0, src[14],
                    0, src[16], 0, src[18], 0, src[20], 0, src[22],
                    0, src[24], 0, src[26], 0, src[28], 0, src[30],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            int16_t y[16] = {
                src[0], src[2], src[4], src[6],
                src[8], src[10], src[12], src[14],
                src[16], src[18], src[20], src[22],
                src[24], src[26], src[28], src[30],
            }, u[16] = {
                src[1], src[1], src[5], src[5],
                src[9], src[9], src[13], src[13],
                src[17], src[17], src[21], src[21],
                src[25], src[25], src[29], src[29],
            }, v[16] = {
                src[3], src[3], src[7], src[7],
                src[11], src[11], src[15], src[15],
                src[19], src[19], src[23], src[23],
                src[27], src[27], src[31], src[31],
            };

            uint8_t r[16], g[16], b[16];
            for (int i = 0; i < 16; i++)
            {
                int32_t c = y[i] - 16;
                int32_t d = u[i] - 128;
                int32_t e = v[i] - 128;

                int32_t t;
#define clamp(x) ((t=(x)) > 255 ? 255 : t < 0 ? 0 : t)
                r[i] = clamp((298 * c + 409 * e + 128) >> 8);
                g[i] = clamp((298 * c - 100 * d - 208 * e + 128) >> 8);
                b[i] = clamp((298 * c + 516 * d + 128) >> 8);
#undef clamp
            }
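            // The integer formulas above are the common fixed-point form of the BT.601
            // limited-range conversion, roughly R = 1.164*(Y-16) + 1.596*(V-128),
            // G = 1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128) and B = 1.164*(Y-16) + 2.018*(U-128),
            // with the coefficients scaled by 256 and the +128 term providing rounding.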

            if (FORMAT == RS2_FORMAT_RGB8)
            {
                uint8_t out[16 * 3] = {
                    r[0], g[0], b[0], r[1], g[1], b[1],
                    r[2], g[2], b[2], r[3], g[3], b[3],
                    r[4], g[4], b[4], r[5], g[5], b[5],
                    r[6], g[6], b[6], r[7], g[7], b[7],
                    r[8], g[8], b[8], r[9], g[9], b[9],
                    r[10], g[10], b[10], r[11], g[11], b[11],
                    r[12], g[12], b[12], r[13], g[13], b[13],
                    r[14], g[14], b[14], r[15], g[15], b[15],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_BGR8)
            {
                uint8_t out[16 * 3] = {
                    b[0], g[0], r[0], b[1], g[1], r[1],
                    b[2], g[2], r[2], b[3], g[3], r[3],
                    b[4], g[4], r[4], b[5], g[5], r[5],
                    b[6], g[6], r[6], b[7], g[7], r[7],
                    b[8], g[8], r[8], b[9], g[9], r[9],
                    b[10], g[10], r[10], b[11], g[11], r[11],
                    b[12], g[12], r[12], b[13], g[13], r[13],
                    b[14], g[14], r[14], b[15], g[15], r[15],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_RGBA8)
            {
                uint8_t out[16 * 4] = {
                    r[0], g[0], b[0], 255, r[1], g[1], b[1], 255,
                    r[2], g[2], b[2], 255, r[3], g[3], b[3], 255,
                    r[4], g[4], b[4], 255, r[5], g[5], b[5], 255,
                    r[6], g[6], b[6], 255, r[7], g[7], b[7], 255,
                    r[8], g[8], b[8], 255, r[9], g[9], b[9], 255,
                    r[10], g[10], b[10], 255, r[11], g[11], b[11], 255,
                    r[12], g[12], b[12], 255, r[13], g[13], b[13], 255,
                    r[14], g[14], b[14], 255, r[15], g[15], b[15], 255,
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_BGRA8)
            {
                uint8_t out[16 * 4] = {
                    b[0], g[0], r[0], 255, b[1], g[1], r[1], 255,
                    b[2], g[2], r[2], 255, b[3], g[3], r[3], 255,
                    b[4], g[4], r[4], 255, b[5], g[5], r[5], 255,
                    b[6], g[6], r[6], 255, b[7], g[7], r[7], 255,
                    b[8], g[8], r[8], 255, b[9], g[9], r[9], 255,
                    b[10], g[10], r[10], 255, b[11], g[11], r[11], 255,
                    b[12], g[12], r[12], 255, b[13], g[13], r[13], 255,
                    b[14], g[14], r[14], 255, b[15], g[15], r[15], 255,
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }
        }
#endif
    }

    void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, byte * const d[], const byte * s, int w, int h, int actual_size)
    {
        switch (dst_format)
        {
        case RS2_FORMAT_Y8:
            unpack_yuy2<RS2_FORMAT_Y8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_Y16:
            unpack_yuy2<RS2_FORMAT_Y16>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_RGB8:
            unpack_yuy2<RS2_FORMAT_RGB8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_RGBA8:
            unpack_yuy2<RS2_FORMAT_RGBA8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_BGR8:
            unpack_yuy2<RS2_FORMAT_BGR8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_BGRA8:
            unpack_yuy2<RS2_FORMAT_BGRA8>(d, s, w, h, actual_size);
            break;
        default:
            LOG_ERROR("Unsupported format for YUY2 conversion.");
            break;
        }
    }
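    // For illustration only: a hypothetical caller holding a width*height YUY2 frame
    // (width*height*2 bytes) could request tightly packed RGB8 output along these lines,
    // assuming the single destination plane has been sized to width*height*3 bytes:
    //
    //     std::vector<byte> rgb(width * height * 3);
    //     byte * planes[] = { rgb.data() };
    //     unpack_yuy2(RS2_FORMAT_RGB8, RS2_STREAM_COLOR, planes, yuy2_frame, width, height, width * height * 2);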


    /////////////////////////////
    // UYVY unpacking routines //
    /////////////////////////////
    // This templated function unpacks UYVY into RGB8/RGBA8/BGR8/BGRA8, depending on the compile-time parameter FORMAT.
    // It is expected that all branching outside of the loop control variable will be removed due to constant-folding.
    template<rs2_format FORMAT> void unpack_uyvy(byte * const d[], const byte * s, int width, int height, int actual_size)
    {
        auto n = width * height;
        assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
#ifdef __SSSE3__
        auto src = reinterpret_cast<const __m128i *>(s);
        auto dst = reinterpret_cast<__m128i *>(d[0]);
        for (; n; n -= 16)
        {
            const __m128i zero = _mm_set1_epi8(0);
            const __m128i n100 = _mm_set1_epi16(100 << 4);
            const __m128i n208 = _mm_set1_epi16(208 << 4);
            const __m128i n298 = _mm_set1_epi16(298 << 4);
            const __m128i n409 = _mm_set1_epi16(409 << 4);
            const __m128i n516 = _mm_set1_epi16(516 << 4);
            const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
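            // Same fixed-point scheme as in unpack_yuy2 above: coefficients and c/d/e inputs are
            // pre-shifted by 4 so that _mm_mulhi_epi16 effectively computes (coef * value) >> 8.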

            // Load 8 UYVY pixels each into two 16-byte registers
            __m128i s0 = _mm_loadu_si128(src++);
            __m128i s1 = _mm_loadu_si128(src++);


            // Shuffle all Y components to the low order bytes of the register, and all U/V components to the high order bytes
            const __m128i evens_odd1s_odd3s = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 4, 8, 12, 2, 6, 10, 14); // to get yyyyyyyyuuuuvvvv
            __m128i yyyyyyyyuuuuvvvv0 = _mm_shuffle_epi8(s0, evens_odd1s_odd3s);
            __m128i yyyyyyyyuuuuvvvv8 = _mm_shuffle_epi8(s1, evens_odd1s_odd3s);

            // Retrieve all 16 Y components as 16-bit values (8 components per register)
            __m128i y16__0_7 = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv0, zero); // convert to 16 bit
            __m128i y16__8_F = _mm_unpacklo_epi8(yyyyyyyyuuuuvvvv8, zero); // convert to 16 bit


            // Retrieve all 16 U and V components as 16-bit values (8 components per register)
            __m128i uv = _mm_unpackhi_epi32(yyyyyyyyuuuuvvvv0, yyyyyyyyuuuuvvvv8); // uuuuuuuuvvvvvvvv
            __m128i u = _mm_unpacklo_epi8(uv, uv); // uu uu uu uu uu uu uu uu u's duplicated
            __m128i v = _mm_unpackhi_epi8(uv, uv); // vv vv vv vv vv vv vv vv
            __m128i u16__0_7 = _mm_unpacklo_epi8(u, zero); // convert to 16 bit
            __m128i u16__8_F = _mm_unpackhi_epi8(u, zero); // convert to 16 bit
            __m128i v16__0_7 = _mm_unpacklo_epi8(v, zero); // convert to 16 bit
            __m128i v16__8_F = _mm_unpackhi_epi8(v, zero); // convert to 16 bit

            // Compute R, G, B values for first 8 pixels
            __m128i c16__0_7 = _mm_slli_epi16(_mm_subs_epi16(y16__0_7, _mm_set1_epi16(16)), 4);
            __m128i d16__0_7 = _mm_slli_epi16(_mm_subs_epi16(u16__0_7, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
            __m128i e16__0_7 = _mm_slli_epi16(_mm_subs_epi16(v16__0_7, _mm_set1_epi16(128)), 4);
            __m128i r16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(e16__0_7, n409)))))); // (298 * c + 409 * e + 128) ; //
            __m128i g16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n100)), _mm_mulhi_epi16(e16__0_7, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
            __m128i b16__0_7 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__0_7, n298), _mm_mulhi_epi16(d16__0_7, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8);

            // Compute R, G, B values for second 8 pixels
            __m128i c16__8_F = _mm_slli_epi16(_mm_subs_epi16(y16__8_F, _mm_set1_epi16(16)), 4);
            __m128i d16__8_F = _mm_slli_epi16(_mm_subs_epi16(u16__8_F, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
            __m128i e16__8_F = _mm_slli_epi16(_mm_subs_epi16(v16__8_F, _mm_set1_epi16(128)), 4);
            __m128i r16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(e16__8_F, n409)))))); // (298 * c + 409 * e + 128) ; //
            __m128i g16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n100)), _mm_mulhi_epi16(e16__8_F, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
            __m128i b16__8_F = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16__8_F, n298), _mm_mulhi_epi16(d16__8_F, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8);

            if (FORMAT == RS2_FORMAT_RGBA8 || FORMAT == RS2_FORMAT_RGB8)
            {
                // Shuffle separate R, G, B values into four registers storing four pixels each in (R, G, B, A) order
                __m128i rg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds)); // hi to take the odds which are the upper bytes we care about
                __m128i ba8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_set1_epi8(-1));
                __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8__0_7, ba8__0_7);
                __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8__0_7, ba8__0_7);

                __m128i rg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds)); // hi to take the odds which are the upper bytes we care about
                __m128i ba8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_set1_epi8(-1));
                __m128i rgba_8_B = _mm_unpacklo_epi16(rg8__8_F, ba8__8_F);
                __m128i rgba_C_F = _mm_unpackhi_epi16(rg8__8_F, ba8__8_F);

                if (FORMAT == RS2_FORMAT_RGBA8)
                {
                    // Store 16 pixels (64 bytes) at once
                    _mm_storeu_si128(dst++, rgba_0_3);
                    _mm_storeu_si128(dst++, rgba_4_7);
                    _mm_storeu_si128(dst++, rgba_8_B);
                    _mm_storeu_si128(dst++, rgba_C_F);
                }

                if (FORMAT == RS2_FORMAT_RGB8)
                {
                    // Shuffle rgb triples to the start and end of each register
                    __m128i rgb0 = _mm_shuffle_epi8(rgba_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                    __m128i rgb1 = _mm_shuffle_epi8(rgba_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                    __m128i rgb2 = _mm_shuffle_epi8(rgba_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                    __m128i rgb3 = _mm_shuffle_epi8(rgba_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                    // Align registers and store 16 pixels (48 bytes) at once
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(rgb1, rgb0, 4));
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(rgb2, rgb1, 8));
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(rgb3, rgb2, 12));
                }
            }

            if (FORMAT == RS2_FORMAT_BGRA8 || FORMAT == RS2_FORMAT_BGR8)
            {
                // Shuffle separate R, G, B values into four registers storing four pixels each in (B, G, R, A) order
                __m128i bg8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__0_7, evens_odds), _mm_shuffle_epi8(g16__0_7, evens_odds)); // hi to take the odds which are the upper bytes we care about
                __m128i ra8__0_7 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__0_7, evens_odds), _mm_set1_epi8(-1));
                __m128i bgra_0_3 = _mm_unpacklo_epi16(bg8__0_7, ra8__0_7);
                __m128i bgra_4_7 = _mm_unpackhi_epi16(bg8__0_7, ra8__0_7);

                __m128i bg8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16__8_F, evens_odds), _mm_shuffle_epi8(g16__8_F, evens_odds)); // hi to take the odds which are the upper bytes we care about
                __m128i ra8__8_F = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16__8_F, evens_odds), _mm_set1_epi8(-1));
                __m128i bgra_8_B = _mm_unpacklo_epi16(bg8__8_F, ra8__8_F);
                __m128i bgra_C_F = _mm_unpackhi_epi16(bg8__8_F, ra8__8_F);

                if (FORMAT == RS2_FORMAT_BGRA8)
                {
                    // Store 16 pixels (64 bytes) at once
                    _mm_storeu_si128(dst++, bgra_0_3);
                    _mm_storeu_si128(dst++, bgra_4_7);
                    _mm_storeu_si128(dst++, bgra_8_B);
                    _mm_storeu_si128(dst++, bgra_C_F);
                }

                if (FORMAT == RS2_FORMAT_BGR8)
                {
                    // Shuffle rgb triples to the start and end of each register
                    __m128i bgr0 = _mm_shuffle_epi8(bgra_0_3, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
                    __m128i bgr1 = _mm_shuffle_epi8(bgra_4_7, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
                    __m128i bgr2 = _mm_shuffle_epi8(bgra_8_B, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
                    __m128i bgr3 = _mm_shuffle_epi8(bgra_C_F, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));

                    // Align registers and store 16 pixels (48 bytes) at once
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(bgr1, bgr0, 4));
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(bgr2, bgr1, 8));
                    _mm_storeu_si128(dst++, _mm_alignr_epi8(bgr3, bgr2, 12));
                }
            }
        }
#else // Generic code for when SSSE3 is not available.
        auto src = reinterpret_cast<const uint8_t *>(s);
        auto dst = reinterpret_cast<uint8_t *>(d[0]);
        for (; n; n -= 16, src += 32)
        {
            int16_t y[16] = {
                src[1], src[3], src[5], src[7],
                src[9], src[11], src[13], src[15],
                src[17], src[19], src[21], src[23],
                src[25], src[27], src[29], src[31],
            }, u[16] = {
                src[0], src[0], src[4], src[4],
                src[8], src[8], src[12], src[12],
                src[16], src[16], src[20], src[20],
                src[24], src[24], src[28], src[28],
            }, v[16] = {
                src[2], src[2], src[6], src[6],
                src[10], src[10], src[14], src[14],
                src[18], src[18], src[22], src[22],
                src[26], src[26], src[30], src[30],
            };

            uint8_t r[16], g[16], b[16];
            for (int i = 0; i < 16; i++)
            {
                int32_t c = y[i] - 16;
                int32_t d = u[i] - 128;
                int32_t e = v[i] - 128;

                int32_t t;
#define clamp(x) ((t=(x)) > 255 ? 255 : t < 0 ? 0 : t)
                r[i] = clamp((298 * c + 409 * e + 128) >> 8);
                g[i] = clamp((298 * c - 100 * d - 208 * e + 128) >> 8);
                b[i] = clamp((298 * c + 516 * d + 128) >> 8);
#undef clamp
            }

            if (FORMAT == RS2_FORMAT_RGB8)
            {
                uint8_t out[16 * 3] = {
                    r[0], g[0], b[0], r[1], g[1], b[1],
                    r[2], g[2], b[2], r[3], g[3], b[3],
                    r[4], g[4], b[4], r[5], g[5], b[5],
                    r[6], g[6], b[6], r[7], g[7], b[7],
                    r[8], g[8], b[8], r[9], g[9], b[9],
                    r[10], g[10], b[10], r[11], g[11], b[11],
                    r[12], g[12], b[12], r[13], g[13], b[13],
                    r[14], g[14], b[14], r[15], g[15], b[15],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_BGR8)
            {
                uint8_t out[16 * 3] = {
                    b[0], g[0], r[0], b[1], g[1], r[1],
                    b[2], g[2], r[2], b[3], g[3], r[3],
                    b[4], g[4], r[4], b[5], g[5], r[5],
                    b[6], g[6], r[6], b[7], g[7], r[7],
                    b[8], g[8], r[8], b[9], g[9], r[9],
                    b[10], g[10], r[10], b[11], g[11], r[11],
                    b[12], g[12], r[12], b[13], g[13], r[13],
                    b[14], g[14], r[14], b[15], g[15], r[15],
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_RGBA8)
            {
                uint8_t out[16 * 4] = {
                    r[0], g[0], b[0], 255, r[1], g[1], b[1], 255,
                    r[2], g[2], b[2], 255, r[3], g[3], b[3], 255,
                    r[4], g[4], b[4], 255, r[5], g[5], b[5], 255,
                    r[6], g[6], b[6], 255, r[7], g[7], b[7], 255,
                    r[8], g[8], b[8], 255, r[9], g[9], b[9], 255,
                    r[10], g[10], b[10], 255, r[11], g[11], b[11], 255,
                    r[12], g[12], b[12], 255, r[13], g[13], b[13], 255,
                    r[14], g[14], b[14], 255, r[15], g[15], b[15], 255,
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }

            if (FORMAT == RS2_FORMAT_BGRA8)
            {
                uint8_t out[16 * 4] = {
                    b[0], g[0], r[0], 255, b[1], g[1], r[1], 255,
                    b[2], g[2], r[2], 255, b[3], g[3], r[3], 255,
                    b[4], g[4], r[4], 255, b[5], g[5], r[5], 255,
                    b[6], g[6], r[6], 255, b[7], g[7], r[7], 255,
                    b[8], g[8], r[8], 255, b[9], g[9], r[9], 255,
                    b[10], g[10], r[10], 255, b[11], g[11], r[11], 255,
                    b[12], g[12], r[12], 255, b[13], g[13], r[13], 255,
                    b[14], g[14], r[14], 255, b[15], g[15], r[15], 255,
                };
                librealsense::copy(dst, out, sizeof out);
                dst += sizeof out;
                continue;
            }
        }
#endif
    }

    void unpack_uyvyc(rs2_format dst_format, rs2_stream dst_stream, byte * const d[], const byte * s, int w, int h, int actual_size)
    {
        switch (dst_format)
        {
        case RS2_FORMAT_RGB8:
            unpack_uyvy<RS2_FORMAT_RGB8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_RGBA8:
            unpack_uyvy<RS2_FORMAT_RGBA8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_BGR8:
            unpack_uyvy<RS2_FORMAT_BGR8>(d, s, w, h, actual_size);
            break;
        case RS2_FORMAT_BGRA8:
            unpack_uyvy<RS2_FORMAT_BGRA8>(d, s, w, h, actual_size);
            break;
        default:
            LOG_ERROR("Unsupported format for UYVY conversion.");
            break;
        }
    }

    //////////////////////////////
    // MJPEG unpacking routines //
    //////////////////////////////
    void unpack_mjpeg(byte * const dest[], const byte * source, int width, int height, int actual_size, int input_size)
    {
        int w, h, bpp;
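        // stb_image decodes the JPEG into a tightly packed 8-bit buffer; passing 0 (false) as
        // req_comp keeps the image's native channel count, which is reported back through bpp.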
        auto uncompressed_rgb = stbi_load_from_memory(source, actual_size, &w, &h, &bpp, false);
        if (uncompressed_rgb)
        {
            auto uncompressed_size = w * h * bpp;
            librealsense::copy(dest[0], uncompressed_rgb, uncompressed_size);
            stbi_image_free(uncompressed_rgb);
        }
        else
            LOG_ERROR("jpeg decode failed");
    }

    ////////////////////////////
    // BGR unpacking routines //
    ////////////////////////////
    void unpack_rgb_from_bgr(byte * const dest[], const byte * source, int width, int height, int actual_size)
    {
        auto count = width * height;
        auto in = reinterpret_cast<const uint8_t *>(source);
        auto out = reinterpret_cast<uint8_t *>(dest[0]);

        librealsense::copy(out, in, count * 3);
        for (auto i = 0; i < count; i++)
        {
            std::swap(out[i * 3], out[i * 3 + 2]);
        }
    }

    void yuy2_converter::process_function(byte * const dest[], const byte * source, int width, int height, int actual_size, int input_size)
    {
        unpack_yuy2(_target_format, _target_stream, dest, source, width, height, actual_size);
    }

    void uyvy_converter::process_function(byte * const dest[], const byte * source, int width, int height, int actual_size, int input_size)
    {
        unpack_uyvyc(_target_format, _target_stream, dest, source, width, height, actual_size);
    }

    void mjpeg_converter::process_function(byte * const dest[], const byte * source, int width, int height, int actual_size, int input_size)
    {
        unpack_mjpeg(dest, source, width, height, actual_size, input_size);
    }

    void bgr_to_rgb::process_function(byte * const dest[], const byte * source, int width, int height, int actual_size, int input_size)
    {
        unpack_rgb_from_bgr(dest, source, width, height, actual_size);
    }
}