00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <string.h>
00024 #include <cassert>
00025
00026 #include "filter.h"
00027
00028
00029 #ifndef _MSC_VER
00030 #include <stdint.h>
00031 #else
00032 typedef __int8 int8_t;
00033 typedef __int16 int16_t;
00034 typedef __int32 int32_t;
00035 typedef __int64 int64_t;
00036 typedef unsigned __int8 uint8_t;
00037 typedef unsigned __int16 uint16_t;
00038 typedef unsigned __int32 uint32_t;
00039 typedef unsigned __int64 uint64_t;
00040 #endif
00041
00042
00043
00044 namespace filter {
00045
00046
00047 namespace detail {
00048 void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
00049 int32_t* out_top = out;
00050 const uint8_t* line_end = in + w;
00051 const uint8_t* in_end = in + w*h;
00052 int32_t line_sum = 0;
00053 for( ; in != line_end; in++, out++ ) {
00054 line_sum += *in;
00055 *out = line_sum;
00056 }
00057 for( ; in != in_end; ) {
00058 int32_t line_sum = 0;
00059 const uint8_t* line_end = in + w;
00060 for( ; in != line_end; in++, out++, out_top++ ) {
00061 line_sum += *in;
00062 *out = *out_top + line_sum;
00063 }
00064 }
00065 }
00066
00067 void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
00068 __m128i zero = _mm_setzero_si128();
00069 b0 = _mm_unpacklo_epi8( a, zero );
00070 b1 = _mm_unpackhi_epi8( a, zero );
00071 }
00072
00073 void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
00074 b = _mm_packus_epi16( a0, a1 );
00075 }
00076
00077
00078
00079 void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00080 assert( w % 16 == 0 && "width must be multiple of 16!" );
00081 const __m128i* i0 = (const __m128i*)(in);
00082 const int16_t* i1 = in+1;
00083 const int16_t* i2 = in+2;
00084 const int16_t* i3 = in+3;
00085 const int16_t* i4 = in+4;
00086 uint8_t* result = out + 2;
00087 const int16_t* const end_input = in + w*h;
00088 __m128i offs = _mm_set1_epi16( 128 );
00089 for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
00090 __m128i result_register_lo;
00091 __m128i result_register_hi;
00092 for( int i=0; i<2; i++ ) {
00093 __m128i* result_register;
00094 if( i==0 ) result_register = &result_register_lo;
00095 else result_register = &result_register_hi;
00096 __m128i i0_register = *i0;
00097 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00098 __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00099 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00100 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00101 *result_register = _mm_setzero_si128();
00102 *result_register = _mm_add_epi16( i0_register, *result_register );
00103 i1_register = _mm_add_epi16( i1_register, i1_register );
00104 i1_register = _mm_add_epi16( i1_register, i1_register );
00105 *result_register = _mm_add_epi16( i1_register, *result_register );
00106 i2_register = _mm_add_epi16( i2_register, i2_register );
00107 *result_register = _mm_add_epi16( i2_register, *result_register );
00108 i2_register = _mm_add_epi16( i2_register, i2_register );
00109 *result_register = _mm_add_epi16( i2_register, *result_register );
00110 i3_register = _mm_add_epi16( i3_register, i3_register );
00111 i3_register = _mm_add_epi16( i3_register, i3_register );
00112 *result_register = _mm_add_epi16( i3_register, *result_register );
00113 *result_register = _mm_add_epi16( i4_register, *result_register );
00114 *result_register = _mm_srai_epi16( *result_register, 7 );
00115 *result_register = _mm_add_epi16( *result_register, offs );
00116 if( i==0 ) {
00117 i0 += 1;
00118 i1 += 8;
00119 i2 += 8;
00120 i3 += 8;
00121 i4 += 8;
00122 }
00123 }
00124 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00125 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00126 }
00127 }
00128
00129
00130
00131
00132 void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00133 assert( w % 16 == 0 && "width must be multiple of 16!" );
00134 const __m128i* i0 = (const __m128i*)(in);
00135 const int16_t* i1 = in+1;
00136 const int16_t* i3 = in+3;
00137 const int16_t* i4 = in+4;
00138 uint8_t* result = out + 2;
00139 const int16_t* const end_input = in + w*h;
00140 __m128i offs = _mm_set1_epi16( 128 );
00141 for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
00142 __m128i result_register_lo;
00143 __m128i result_register_hi;
00144 for( int i=0; i<2; i++ ) {
00145 __m128i* result_register;
00146 if( i==0 ) result_register = &result_register_lo;
00147 else result_register = &result_register_hi;
00148 __m128i i0_register = *i0;
00149 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00150 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00151 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00152 *result_register = _mm_setzero_si128();
00153 *result_register = _mm_add_epi16( i0_register, *result_register );
00154 i1_register = _mm_add_epi16( i1_register, i1_register );
00155 *result_register = _mm_add_epi16( i1_register, *result_register );
00156 i3_register = _mm_add_epi16( i3_register, i3_register );
00157 *result_register = _mm_sub_epi16( *result_register, i3_register );
00158 *result_register = _mm_sub_epi16( *result_register, i4_register );
00159 *result_register = _mm_srai_epi16( *result_register, 7 );
00160 *result_register = _mm_add_epi16( *result_register, offs );
00161 if( i==0 ) {
00162 i0 += 1;
00163 i1 += 8;
00164 i3 += 8;
00165 i4 += 8;
00166 }
00167 }
00168 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00169 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00170 }
00171 }
00172
00173
00174
00175
00176 void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00177 assert( w % 16 == 0 && "width must be multiple of 16!" );
00178 const __m128i* i0 = (const __m128i*)(in);
00179 const int16_t* i1 = in+1;
00180 const int16_t* i2 = in+2;
00181 uint8_t* result = out + 1;
00182 const int16_t* const end_input = in + w*h;
00183 const size_t blocked_loops = (w*h-2)/16;
00184 __m128i offs = _mm_set1_epi16( 128 );
00185 for( size_t i=0; i != blocked_loops; i++ ) {
00186 __m128i result_register_lo;
00187 __m128i result_register_hi;
00188 __m128i i1_register;
00189 __m128i i2_register;
00190
00191 i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00192 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00193 result_register_lo = *i0;
00194 i1_register = _mm_add_epi16( i1_register, i1_register );
00195 result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
00196 result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
00197 result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00198 result_register_lo = _mm_add_epi16( result_register_lo, offs );
00199
00200 i0++;
00201 i1+=8;
00202 i2+=8;
00203
00204 i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00205 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00206 result_register_hi = *i0;
00207 i1_register = _mm_add_epi16( i1_register, i1_register );
00208 result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
00209 result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
00210 result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00211 result_register_hi = _mm_add_epi16( result_register_hi, offs );
00212
00213 i0++;
00214 i1+=8;
00215 i2+=8;
00216
00217 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00218 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00219
00220 result += 16;
00221 }
00222 }
00223
00224
00225
00226
00227 void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00228 assert( w % 16 == 0 && "width must be multiple of 16!" );
00229 const __m128i* i0 = (const __m128i*)(in);
00230 const int16_t* i2 = in+2;
00231 uint8_t* result = out + 1;
00232 const int16_t* const end_input = in + w*h;
00233 const size_t blocked_loops = (w*h-2)/16;
00234 __m128i offs = _mm_set1_epi16( 128 );
00235 for( size_t i=0; i != blocked_loops; i++ ) {
00236 __m128i result_register_lo;
00237 __m128i result_register_hi;
00238 __m128i i2_register;
00239
00240 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00241 result_register_lo = *i0;
00242 result_register_lo = _mm_sub_epi16( result_register_lo, i2_register );
00243 result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00244 result_register_lo = _mm_add_epi16( result_register_lo, offs );
00245
00246 i0 += 1;
00247 i2 += 8;
00248
00249 i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00250 result_register_hi = *i0;
00251 result_register_hi = _mm_sub_epi16( result_register_hi, i2_register );
00252 result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00253 result_register_hi = _mm_add_epi16( result_register_hi, offs );
00254
00255 i0 += 1;
00256 i2 += 8;
00257
00258 pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00259 _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00260
00261 result += 16;
00262 }
00263
00264 for( ; i2 < end_input; i2++, result++) {
00265 *result = ((*(i2-2) - *i2)>>2)+128;
00266 }
00267 }
00268
00269 void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00270 using namespace std;
00271 memset( out_h, 0, w*h*sizeof(int16_t) );
00272 memset( out_v, 0, w*h*sizeof(int16_t) );
00273 assert( w % 16 == 0 && "width must be multiple of 16!" );
00274 const int w_chunk = w/16;
00275 __m128i* i0 = (__m128i*)( in );
00276 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00277 __m128i* i2 = (__m128i*)( in ) + w_chunk*2;
00278 __m128i* i3 = (__m128i*)( in ) + w_chunk*3;
00279 __m128i* i4 = (__m128i*)( in ) + w_chunk*4;
00280 __m128i* result_h = (__m128i*)( out_h ) + 4*w_chunk;
00281 __m128i* result_v = (__m128i*)( out_v ) + 4*w_chunk;
00282 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00283 __m128i sixes = _mm_set1_epi16( 6 );
00284 __m128i fours = _mm_set1_epi16( 4 );
00285 for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {
00286 __m128i ilo, ihi;
00287 unpack_8bit_to_16bit( *i0, ihi, ilo );
00288 *result_h = _mm_add_epi16( ihi, *result_h );
00289 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00290 *result_v = _mm_add_epi16( *result_v, ihi );
00291 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00292 unpack_8bit_to_16bit( *i1, ihi, ilo );
00293 *result_h = _mm_add_epi16( ihi, *result_h );
00294 *result_h = _mm_add_epi16( ihi, *result_h );
00295 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00296 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00297 ihi = _mm_mullo_epi16( ihi, fours );
00298 ilo = _mm_mullo_epi16( ilo, fours );
00299 *result_v = _mm_add_epi16( *result_v, ihi );
00300 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00301 unpack_8bit_to_16bit( *i2, ihi, ilo );
00302 ihi = _mm_mullo_epi16( ihi, sixes );
00303 ilo = _mm_mullo_epi16( ilo, sixes );
00304 *result_v = _mm_add_epi16( *result_v, ihi );
00305 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00306 unpack_8bit_to_16bit( *i3, ihi, ilo );
00307 *result_h = _mm_sub_epi16( *result_h, ihi );
00308 *result_h = _mm_sub_epi16( *result_h, ihi );
00309 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00310 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00311 ihi = _mm_mullo_epi16( ihi, fours );
00312 ilo = _mm_mullo_epi16( ilo, fours );
00313 *result_v = _mm_add_epi16( *result_v, ihi );
00314 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00315 unpack_8bit_to_16bit( *i4, ihi, ilo );
00316 *result_h = _mm_sub_epi16( *result_h, ihi );
00317 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00318 *result_v = _mm_add_epi16( *result_v, ihi );
00319 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00320 }
00321 }
00322
00323 void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
00324 memset( out, 0, w*h*sizeof(int16_t) );
00325 using namespace std;
00326 assert( w % 16 == 0 && "width must be multiple of 16!" );
00327 const int w_chunk = w/16;
00328 __m128i* i0 = (__m128i*)( in );
00329 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00330 __m128i* i3 = (__m128i*)( in ) + w_chunk*3;
00331 __m128i* i4 = (__m128i*)( in ) + w_chunk*4;
00332 __m128i* result = (__m128i*)( out ) + 4*w_chunk;
00333 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00334 for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
00335 __m128i ilo0, ihi0;
00336 unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
00337 __m128i ilo1, ihi1;
00338 unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
00339 *result = _mm_add_epi16( ihi0, ihi1 );
00340 *(result+1) = _mm_add_epi16( ilo0, ilo1 );
00341 __m128i ilo, ihi;
00342 unpack_8bit_to_16bit( *i3, ihi, ilo );
00343 *result = _mm_sub_epi16( *result, ihi );
00344 *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00345 unpack_8bit_to_16bit( *i4, ihi, ilo );
00346 *result = _mm_sub_epi16( *result, ihi );
00347 *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00348 }
00349 }
00350
00351 void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
00352 assert( w % 16 == 0 && "width must be multiple of 16!" );
00353 const __m128i* i0 = (const __m128i*)(in);
00354 const int16_t* i1 = in+1;
00355 const int16_t* i3 = in+3;
00356 const int16_t* i4 = in+4;
00357 int16_t* result = out + 2;
00358 const int16_t* const end_input = in + w*h;
00359 for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
00360 __m128i result_register;
00361 __m128i i0_register = *i0;
00362 __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00363 __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00364 __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00365 result_register = _mm_add_epi16( i0_register, i1_register );
00366 result_register = _mm_sub_epi16( result_register, i3_register );
00367 result_register = _mm_sub_epi16( result_register, i4_register );
00368 _mm_storeu_si128( ((__m128i*)( result )), result_register );
00369 }
00370 }
00371
00372 void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00373 using namespace std;
00374 assert( w % 16 == 0 && "width must be multiple of 16!" );
00375 const int w_chunk = w/16;
00376 __m128i* i0 = (__m128i*)( in );
00377 __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
00378 __m128i* i2 = (__m128i*)( in ) + w_chunk*2;
00379 __m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
00380 __m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;
00381 __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00382 for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
00383 *result_h = _mm_setzero_si128();
00384 *(result_h+1) = _mm_setzero_si128();
00385 *result_v = _mm_setzero_si128();
00386 *(result_v+1) = _mm_setzero_si128();
00387 __m128i ilo, ihi;
00388 unpack_8bit_to_16bit( *i0, ihi, ilo );
00389 unpack_8bit_to_16bit( *i0, ihi, ilo );
00390 *result_h = _mm_add_epi16( ihi, *result_h );
00391 *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00392 *result_v = _mm_add_epi16( *result_v, ihi );
00393 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00394 unpack_8bit_to_16bit( *i1, ihi, ilo );
00395 *result_v = _mm_add_epi16( *result_v, ihi );
00396 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00397 *result_v = _mm_add_epi16( *result_v, ihi );
00398 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00399 unpack_8bit_to_16bit( *i2, ihi, ilo );
00400 *result_h = _mm_sub_epi16( *result_h, ihi );
00401 *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00402 *result_v = _mm_add_epi16( *result_v, ihi );
00403 *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00404 }
00405 }
00406 };
00407
00408 void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00409 int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00410 int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00411 detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
00412 detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
00413 detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
00414 _mm_free( temp_h );
00415 _mm_free( temp_v );
00416 }
00417
00418 void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00419 int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00420 int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00421 detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
00422 detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
00423 detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
00424 _mm_free( temp_h );
00425 _mm_free( temp_v );
00426 }
00427
00428
00429
00430
00431
00432
00433 void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00434 int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00435 detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
00436 detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
00437 _mm_free( temp );
00438 }
00439
00440
00441
00442
00443
00444
00445 void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00446 int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
00447 detail::integral_image( in, integral, w, h );
00448 int16_t* out_ptr = out + 3 + 3*w;
00449 int16_t* out_end = out + w * h - 2 - 2*w;
00450 const int32_t* i00 = integral;
00451 const int32_t* i50 = integral + 5;
00452 const int32_t* i05 = integral + 5*w;
00453 const int32_t* i55 = integral + 5 + 5*w;
00454 const int32_t* i11 = integral + 1 + 1*w;
00455 const int32_t* i41 = integral + 4 + 1*w;
00456 const int32_t* i14 = integral + 1 + 4*w;
00457 const int32_t* i44 = integral + 4 + 4*w;
00458 const uint8_t* im22 = in + 3 + 3*w;
00459 for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
00460 int32_t result = 0;
00461 result = -( *i55 - *i50 - *i05 + *i00 );
00462 result += 2*( *i44 - *i41 - *i14 + *i11 );
00463 result += 7* *im22;
00464 *out_ptr = result;
00465 }
00466 _mm_free( integral );
00467 }
00468 };