00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <iostream>
00024 #include <string.h>
00025 #include <cassert>
00026
00027 #include "filter.h"
00028
00029
00030
00031 #ifndef _MSC_VER
00032 #include <stdint.h>
00033 #else
00034 typedef __int8 int8_t;
00035 typedef __int16 int16_t;
00036 typedef __int32 int32_t;
00037 typedef __int64 int64_t;
00038 typedef unsigned __int8 uint8_t;
00039 typedef unsigned __int16 uint16_t;
00040 typedef unsigned __int32 uint32_t;
00041 typedef unsigned __int64 uint64_t;
00042 #endif
00043
00044
00045
00046 namespace filter {
00047
00048
00049 namespace detail {
00050
00051 void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
00052 int32_t* out_top = out;
00053 const uint8_t* line_end = in + w;
00054 const uint8_t* in_end = in + w*h;
00055 int32_t line_sum = 0;
00056 for( ; in != line_end; in++, out++ ) {
00057 line_sum += *in;
00058 *out = line_sum;
00059 }
00060 for( ; in != in_end; ) {
00061 int32_t line_sum = 0;
00062 const uint8_t* line_end = in + w;
00063 for( ; in != line_end; in++, out++, out_top++ ) {
00064 line_sum += *in;
00065 *out = *out_top + line_sum;
00066 }
00067 }
00068 }
00069
00070 void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
00071 __m128i zero = _mm_setzero_si128();
00072
00073 b0 = _mm_unpacklo_epi8( a, zero );
00074
00075 b1 = _mm_unpackhi_epi8( a, zero );
00076 }
00077
00078 void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
00079 b = _mm_packus_epi16( a0, a1 );
00080 }
00081
00082
00083
00084 void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00085 assert( w % 16 == 0 && "width must be multiple of 16!" );
00086 const __m128i* i0 = (const __m128i*)(in);
00087 const int16_t* i1 = in+1;
00088 const int16_t* i2 = in+2;
00089 const int16_t* i3 = in+3;
00090 const int16_t* i4 = in+4;
00091 uint8_t* result = out + 2;
00092 const int16_t* const end_input = in + w*h;
00093 __m128i offs = _mm_set1_epi16( 128 );
00094 for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
00095 __m128i result_register_lo;
00096 __m128i result_register_hi;
00097 for( int i=0; i<2; i++ ) {
00098 __m128i* result_register;
00099 if( i==0 ) result_register = &result_register_lo;
00100 else result_register = &result_register_hi;
00101 __m128i i0_register = *i0;
00102 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00103 __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00104 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00105 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00106 *result_register = _mm_setzero_si128();
00107 *result_register = _mm_add_epi16( i0_register, *result_register );
00108 i1_register = _mm_add_epi16( i1_register, i1_register );
00109 i1_register = _mm_add_epi16( i1_register, i1_register );
00110 *result_register = _mm_add_epi16( i1_register, *result_register );
00111 i2_register = _mm_add_epi16( i2_register, i2_register );
00112 *result_register = _mm_add_epi16( i2_register, *result_register );
00113 i2_register = _mm_add_epi16( i2_register, i2_register );
00114 *result_register = _mm_add_epi16( i2_register, *result_register );
00115 i3_register = _mm_add_epi16( i3_register, i3_register );
00116 i3_register = _mm_add_epi16( i3_register, i3_register );
00117 *result_register = _mm_add_epi16( i3_register, *result_register );
00118 *result_register = _mm_add_epi16( i4_register, *result_register );
00119
00120 *result_register = _mm_srai_epi16( *result_register, 7 );
00121 *result_register = _mm_add_epi16( *result_register, offs );
00122 if( i==0 ) {
00123 i0 += 1;
00124 i1 += 8;
00125 i2 += 8;
00126 i3 += 8;
00127 i4 += 8;
00128 }
00129 }
00130 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00131 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00132 }
00133 }
00134
00135
00136
00137
00138 void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00139 assert( w % 16 == 0 && "width must be multiple of 16!" );
00140 const __m128i* i0 = (const __m128i*)(in);
00141 const int16_t* i1 = in+1;
00142 const int16_t* i3 = in+3;
00143 const int16_t* i4 = in+4;
00144 uint8_t* result = out + 2;
00145 const int16_t* const end_input = in + w*h;
00146 __m128i offs = _mm_set1_epi16( 128 );
00147 for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
00148 __m128i result_register_lo;
00149 __m128i result_register_hi;
00150 for( int i=0; i<2; i++ ) {
00151 __m128i* result_register;
00152 if( i==0 ) result_register = &result_register_lo;
00153 else result_register = &result_register_hi;
00154 __m128i i0_register = *i0;
00155 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00156 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00157 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00158 *result_register = _mm_setzero_si128();
00159 *result_register = _mm_add_epi16( i0_register, *result_register );
00160 i1_register = _mm_add_epi16( i1_register, i1_register );
00161 *result_register = _mm_add_epi16( i1_register, *result_register );
00162 i3_register = _mm_add_epi16( i3_register, i3_register );
00163 *result_register = _mm_sub_epi16( *result_register, i3_register );
00164 *result_register = _mm_sub_epi16( *result_register, i4_register );
00165 *result_register = _mm_srai_epi16( *result_register, 7 );
00166 *result_register = _mm_add_epi16( *result_register, offs );
00167 if( i==0 ) {
00168 i0 += 1;
00169 i1 += 8;
00170 i3 += 8;
00171 i4 += 8;
00172 }
00173 }
00174 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00175 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00176 }
00177 }
00178
00179
00180
00181
00182 void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00183 assert( w % 16 == 0 && "width must be multiple of 16!" );
00184 const __m128i* i0 = (const __m128i*)(in);
00185 const int16_t* i1 = in+1;
00186 const int16_t* i2 = in+2;
00187 uint8_t* result = out + 1;
00188 const int16_t* const end_input = in + w*h;
00189 const size_t blocked_loops = (w*h-2)/16;
00190 __m128i offs = _mm_set1_epi16( 128 );
00191 for( size_t i=0; i != blocked_loops; i++ ) {
00192 __m128i result_register_lo;
00193 __m128i result_register_hi;
00194 __m128i i1_register;
00195 __m128i i2_register;
00196
00197 i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00198 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00199 result_register_lo = *i0;
00200
00201
00202 i1_register = _mm_add_epi16( i1_register, i1_register );
00203
00204 result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
00205
00206 result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
00207
00208 result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00209
00210 result_register_lo = _mm_add_epi16( result_register_lo, offs );
00211
00212 i0++;
00213 i1+=8;
00214 i2+=8;
00215
00216 i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00217 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00218 result_register_hi = *i0;
00219 i1_register = _mm_add_epi16( i1_register, i1_register );
00220 result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
00221 result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
00222 result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00223 result_register_hi = _mm_add_epi16( result_register_hi, offs );
00224
00225 i0++;
00226 i1+=8;
00227 i2+=8;
00228
00229 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00230 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00231
00232 result += 16;
00233 }
00234 }
00235
00236
00237
00238
00239 void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00240 assert( w % 16 == 0 && "width must be multiple of 16!" );
00241 const __m128i* i0 = (const __m128i*)(in);
00242 const int16_t* i2 = in+2;
00243 uint8_t* result = out + 1;
00244 const int16_t* const end_input = in + w*h;
00245 const size_t blocked_loops = (w*h-2)/16;
00246 __m128i offs = _mm_set1_epi16( 128 );
00247 for( size_t i=0; i != blocked_loops; i++ ) {
00248 __m128i result_register_lo;
00249 __m128i result_register_hi;
00250 __m128i i2_register;
00251
00252 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00253 result_register_lo = *i0;
00254 result_register_lo = _mm_sub_epi16( result_register_lo, i2_register );
00255 result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00256 result_register_lo = _mm_add_epi16( result_register_lo, offs );
00257
00258 i0 += 1;
00259 i2 += 8;
00260
00261 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00262 result_register_hi = *i0;
00263 result_register_hi = _mm_sub_epi16( result_register_hi, i2_register );
00264 result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00265 result_register_hi = _mm_add_epi16( result_register_hi, offs );
00266
00267 i0 += 1;
00268 i2 += 8;
00269
00270 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00271 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00272
00273 result += 16;
00274 }
00275
00276 for( ; i2 < end_input; i2++, result++) {
00277 *result = ((*(i2-2) - *i2)>>2)+128;
00278 }
00279 }
00280
00281 void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00282 using namespace std;
00283 memset( out_h, 0, w*h*sizeof(int16_t) );
00284 memset( out_v, 0, w*h*sizeof(int16_t) );
00285 assert( w % 16 == 0 && "width must be multiple of 16!" );
00286
00287
00288
00289 const int w_chunk = w/16;
00290 __m128i* i0 = (__m128i*)( in );
00291 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00292 __m128i* i2 = (__m128i*)( in ) + w_chunk*2;
00293 __m128i* i3 = (__m128i*)( in ) + w_chunk*3;
00294 __m128i* i4 = (__m128i*)( in ) + w_chunk*4;
00295
00296 __m128i* result_h = (__m128i*)( out_h ) + 4*w_chunk;
00297
00298 __m128i* result_v = (__m128i*)( out_v ) + 4*w_chunk;
00299 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00300
00301 __m128i sixes = _mm_set1_epi16( 6 );
00302 __m128i fours = _mm_set1_epi16( 4 );
00303 for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {
00304 __m128i ilo, ihi;
00305
00306 unpack_8bit_to_16bit( *i0, ihi, ilo );
00307
00308 *result_h = _mm_add_epi16( ihi, *result_h );
00309 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00310
00311 *result_v = _mm_add_epi16( *result_v, ihi );
00312 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00313
00314
00315 unpack_8bit_to_16bit( *i1, ihi, ilo );
00316
00317 *result_h = _mm_add_epi16( ihi, *result_h );
00318 *result_h = _mm_add_epi16( ihi, *result_h );
00319 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00320 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00321 ihi = _mm_mullo_epi16( ihi, fours );
00322 ilo = _mm_mullo_epi16( ilo, fours );
00323
00324
00325 *result_v = _mm_add_epi16( *result_v, ihi );
00326 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00327
00328 unpack_8bit_to_16bit( *i2, ihi, ilo );
00329
00330 ihi = _mm_mullo_epi16( ihi, sixes );
00331 ilo = _mm_mullo_epi16( ilo, sixes );
00332
00333
00334 *result_v = _mm_add_epi16( *result_v, ihi );
00335 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00336
00337 unpack_8bit_to_16bit( *i3, ihi, ilo );
00338
00339 *result_h = _mm_sub_epi16( *result_h, ihi );
00340 *result_h = _mm_sub_epi16( *result_h, ihi );
00341 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00342 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00343 ihi = _mm_mullo_epi16( ihi, fours );
00344 ilo = _mm_mullo_epi16( ilo, fours );
00345
00346 *result_v = _mm_add_epi16( *result_v, ihi );
00347 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00348
00349 unpack_8bit_to_16bit( *i4, ihi, ilo );
00350
00351 *result_h = _mm_sub_epi16( *result_h, ihi );
00352 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00353
00354 *result_v = _mm_add_epi16( *result_v, ihi );
00355 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00356 }
00357 }
00358
00359 void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
00360 memset( out, 0, w*h*sizeof(int16_t) );
00361 using namespace std;
00362 assert( w % 16 == 0 && "width must be multiple of 16!" );
00363 const int w_chunk = w/16;
00364 __m128i* i0 = (__m128i*)( in );
00365 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00366 __m128i* i3 = (__m128i*)( in ) + w_chunk*3;
00367 __m128i* i4 = (__m128i*)( in ) + w_chunk*4;
00368 __m128i* result = (__m128i*)( out ) + 4*w_chunk;
00369 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00370 for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
00371 __m128i ilo0, ihi0;
00372 unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
00373 __m128i ilo1, ihi1;
00374 unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
00375 *result = _mm_add_epi16( ihi0, ihi1 );
00376 *(result+1) = _mm_add_epi16( ilo0, ilo1 );
00377 __m128i ilo, ihi;
00378 unpack_8bit_to_16bit( *i3, ihi, ilo );
00379 *result = _mm_sub_epi16( *result, ihi );
00380 *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00381 unpack_8bit_to_16bit( *i4, ihi, ilo );
00382 *result = _mm_sub_epi16( *result, ihi );
00383 *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00384 }
00385 }
00386
00387 void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
00388 assert( w % 16 == 0 && "width must be multiple of 16!" );
00389 const __m128i* i0 = (const __m128i*)(in);
00390 const int16_t* i1 = in+1;
00391 const int16_t* i3 = in+3;
00392 const int16_t* i4 = in+4;
00393 int16_t* result = out + 2;
00394 const int16_t* const end_input = in + w*h;
00395 for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
00396 __m128i result_register;
00397 __m128i i0_register = *i0;
00398 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00399 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00400 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00401 result_register = _mm_add_epi16( i0_register, i1_register );
00402 result_register = _mm_sub_epi16( result_register, i3_register );
00403 result_register = _mm_sub_epi16( result_register, i4_register );
00404 _mm_storeu_si128( ((__m128i*)( result )), result_register );
00405 }
00406 }
00407
00408 void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00409 using namespace std;
00410 assert( w % 16 == 0 && "width must be multiple of 16!" );
00411 const int w_chunk = w/16;
00412 __m128i* i0 = (__m128i*)( in );
00413 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00414 __m128i* i2 = (__m128i*)( in ) + w_chunk*2;
00415 __m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
00416 __m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;
00417 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00418 for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
00419 *result_h = _mm_setzero_si128();
00420 *(result_h+1) = _mm_setzero_si128();
00421 *result_v = _mm_setzero_si128();
00422 *(result_v+1) = _mm_setzero_si128();
00423 __m128i ilo, ihi;
00424 unpack_8bit_to_16bit( *i0, ihi, ilo );
00425 unpack_8bit_to_16bit( *i0, ihi, ilo );
00426 *result_h = _mm_add_epi16( ihi, *result_h );
00427 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00428 *result_v = _mm_add_epi16( *result_v, ihi );
00429 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00430 unpack_8bit_to_16bit( *i1, ihi, ilo );
00431 *result_v = _mm_add_epi16( *result_v, ihi );
00432 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00433 *result_v = _mm_add_epi16( *result_v, ihi );
00434 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00435 unpack_8bit_to_16bit( *i2, ihi, ilo );
00436 *result_h = _mm_sub_epi16( *result_h, ihi );
00437 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00438 *result_v = _mm_add_epi16( *result_v, ihi );
00439 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00440 }
00441 }
00442 };
00443
00444 void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00445 int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00446 int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00447 detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
00448 detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
00449 detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
00450 _mm_free( temp_h );
00451 _mm_free( temp_v );
00452 }
00453
00454
00455 void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00456 int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00457 int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00458 detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
00459 detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
00460 detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
00461 _mm_free( temp_h );
00462 _mm_free( temp_v );
00463 }
00464
00465
00466
00467
00468
00469
00470 void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00471 int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00472 detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
00473 detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
00474 _mm_free( temp );
00475 }
00476
00477
00478
00479
00480
00481
00482 void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00483 int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
00484 detail::integral_image( in, integral, w, h );
00485 int16_t* out_ptr = out + 3 + 3*w;
00486 int16_t* out_end = out + w * h - 2 - 2*w;
00487 const int32_t* i00 = integral;
00488 const int32_t* i50 = integral + 5;
00489 const int32_t* i05 = integral + 5*w;
00490 const int32_t* i55 = integral + 5 + 5*w;
00491 const int32_t* i11 = integral + 1 + 1*w;
00492 const int32_t* i41 = integral + 4 + 1*w;
00493 const int32_t* i14 = integral + 1 + 4*w;
00494 const int32_t* i44 = integral + 4 + 4*w;
00495 const uint8_t* im22 = in + 3 + 3*w;
00496 for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
00497 int32_t result = 0;
00498 result = -( *i55 - *i50 - *i05 + *i00 );
00499 result += 2*( *i44 - *i41 - *i14 + *i11 );
00500 result += 7* *im22;
00501 *out_ptr = result;
00502 }
00503 _mm_free( integral );
00504 }
00505 };