libviso2: filter.cpp Source File

Go to the documentation of this file.
00001 /*
00002 Copyright 2012. All rights reserved.
00003 Institute of Measurement and Control Systems
00004 Karlsruhe Institute of Technology, Germany
00005 
00006 This file is part of libelas.
00007 Authors: Julius Ziegler, Andreas Geiger
00008 
00009 libelas is free software; you can redistribute it and/or modify it under the
00010 terms of the GNU General Public License as published by the Free Software
00011 Foundation; either version 3 of the License, or any later version.
00012 
00013 libelas is distributed in the hope that it will be useful, but WITHOUT ANY
00014 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
00015 PARTICULAR PURPOSE. See the GNU General Public License for more details.
00016 
00017 You should have received a copy of the GNU General Public License along with
00018 libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
00019 Street, Fifth Floor, Boston, MA 02110-1301, USA 
00020 */
00021 
00022 #include <stdio.h>
00023 #include <string.h>
00024 #include <cassert>
00025 
00026 #include "filter.h"
00027 
00028 // define fixed-width datatypes for Visual Studio projects
00029 #ifndef _MSC_VER
00030   #include <stdint.h>
00031 #else
00032   typedef __int8            int8_t;
00033   typedef __int16           int16_t;
00034   typedef __int32           int32_t;
00035   typedef __int64           int64_t;
00036   typedef unsigned __int8   uint8_t;
00037   typedef unsigned __int16  uint16_t;
00038   typedef unsigned __int32  uint32_t;
00039   typedef unsigned __int64  uint64_t;
00040 #endif
00041 
00042 // fast filters: implements 3x3 and 5x5 sobel filters and 
00043 //               5x5 blob and corner filters based on SSE2/3 instructions
00044 namespace filter {
00045   
00046   // private namespace, public user functions at the bottom of this file
00047   namespace detail {
00048     void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
00049       int32_t* out_top = out;
00050       const uint8_t* line_end = in + w;
00051       const uint8_t* in_end   = in + w*h;
00052       int32_t line_sum = 0;
00053       for( ; in != line_end; in++, out++ ) {
00054         line_sum += *in;
00055         *out = line_sum;
00056       }
00057       for( ; in != in_end; ) {
00058         int32_t line_sum = 0;
00059         const uint8_t* line_end = in + w;
00060         for( ; in != line_end; in++, out++, out_top++ ) {
00061           line_sum += *in;
00062           *out = *out_top + line_sum;
00063         }
00064       }
00065     }
00066     
00067     void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
00068       __m128i zero = _mm_setzero_si128();
00069       b0 = _mm_unpacklo_epi8( a, zero );
00070       b1 = _mm_unpackhi_epi8( a, zero );
00071     }
00072     
00073     void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
00074       b = _mm_packus_epi16( a0, a1 );
00075     }
00076     
00077     // convolve image with a (1,4,6,4,1) row vector. Result is accumulated into output.
00078     // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255].
00079     void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00080       assert( w % 16 == 0 && "width must be multiple of 16!" );
00081       const __m128i* i0 = (const __m128i*)(in);
00082       const int16_t* i1 = in+1;
00083       const int16_t* i2 = in+2;
00084       const int16_t* i3 = in+3;
00085       const int16_t* i4 = in+4;
00086       uint8_t* result   = out + 2;
00087       const int16_t* const end_input = in + w*h;
00088       __m128i offs = _mm_set1_epi16( 128 );
00089       for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
00090         __m128i result_register_lo;
00091         __m128i result_register_hi;
00092         for( int i=0; i<2; i++ ) {
00093           __m128i* result_register;
00094           if( i==0 ) result_register = &result_register_lo;
00095           else       result_register = &result_register_hi;
00096           __m128i i0_register = *i0;
00097           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00098           __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00099           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00100           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00101           *result_register = _mm_setzero_si128();
00102           *result_register = _mm_add_epi16( i0_register, *result_register );
00103           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00104           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00105           *result_register = _mm_add_epi16( i1_register, *result_register );
00106           i2_register      = _mm_add_epi16( i2_register, i2_register  );
00107           *result_register = _mm_add_epi16( i2_register, *result_register );
00108           i2_register      = _mm_add_epi16( i2_register, i2_register  );
00109           *result_register = _mm_add_epi16( i2_register, *result_register );
00110           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00111           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00112           *result_register = _mm_add_epi16( i3_register, *result_register );
00113           *result_register = _mm_add_epi16( i4_register, *result_register );
00114           *result_register = _mm_srai_epi16( *result_register, 7 );
00115           *result_register = _mm_add_epi16( *result_register, offs );
00116           if( i==0 ) {
00117             i0 += 1;
00118             i1 += 8;
00119             i2 += 8;
00120             i3 += 8;
00121             i4 += 8;
00122           }
00123         }
00124         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00125         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00126       }
00127     }
00128     
00129     // convolve image with a (1,2,0,-2,-1) row vector. Result is accumulated into output.
00130     // This one works on 16bit input and 8bit output.
00131     // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255].
00132     void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00133       assert( w % 16 == 0 && "width must be multiple of 16!" );
00134       const __m128i*  i0 = (const __m128i*)(in);
00135       const int16_t*    i1 = in+1;
00136       const int16_t*    i3 = in+3;
00137       const int16_t*    i4 = in+4;
00138       uint8_t* result    = out + 2;
00139       const int16_t* const end_input = in + w*h;
00140       __m128i offs = _mm_set1_epi16( 128 );
00141       for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
00142         __m128i result_register_lo;
00143         __m128i result_register_hi;
00144         for( int i=0; i<2; i++ ) {
00145           __m128i* result_register;
00146           if( i==0 ) result_register = &result_register_lo;
00147           else       result_register = &result_register_hi;
00148           __m128i i0_register = *i0;
00149           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00150           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00151           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00152           *result_register = _mm_setzero_si128();
00153           *result_register = _mm_add_epi16( i0_register,   *result_register );
00154           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00155           *result_register = _mm_add_epi16( i1_register,   *result_register );
00156           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00157           *result_register = _mm_sub_epi16( *result_register, i3_register );
00158           *result_register = _mm_sub_epi16( *result_register, i4_register );
00159           *result_register = _mm_srai_epi16( *result_register, 7 );
00160           *result_register = _mm_add_epi16( *result_register, offs );
00161           if( i==0 ) {
00162             i0 += 1;
00163             i1 += 8;
00164             i3 += 8;
00165             i4 += 8;
00166           }
00167         }
00168         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00169         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00170       }
00171     }
00172 
00173     // convolve image with a (1,2,1) row vector. Result is accumulated into output.
00174     // This one works on 16bit input and 8bit output.
00175     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
00176     void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00177       assert( w % 16 == 0 && "width must be multiple of 16!" );
00178       const __m128i* i0 = (const __m128i*)(in);
00179       const int16_t* i1 = in+1;
00180       const int16_t* i2 = in+2;
00181       uint8_t* result   = out + 1;
00182       const int16_t* const end_input = in + w*h;
00183       const size_t blocked_loops = (w*h-2)/16;
00184       __m128i offs = _mm_set1_epi16( 128 );
00185       for( size_t i=0; i != blocked_loops; i++ ) {
00186         __m128i result_register_lo;
00187         __m128i result_register_hi;
00188         __m128i i1_register;
00189         __m128i i2_register;
00190         
00191         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
00192         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
00193         result_register_lo = *i0;
00194         i1_register        = _mm_add_epi16( i1_register, i1_register );
00195         result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
00196         result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
00197         result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00198         result_register_lo = _mm_add_epi16( result_register_lo, offs );
00199 
00200         i0++;
00201         i1+=8;
00202         i2+=8;
00203 
00204         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
00205         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
00206         result_register_hi = *i0;
00207         i1_register        = _mm_add_epi16( i1_register, i1_register );
00208         result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
00209         result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
00210         result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00211         result_register_hi = _mm_add_epi16( result_register_hi, offs );
00212 
00213         i0++;
00214         i1+=8;
00215         i2+=8;
00216 
00217         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00218         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00219       
00220         result += 16;
00221       }
00222     }
00223     
00224     // convolve image with a (1,0,-1) row vector. Result is accumulated into output.
00225     // This one works on 16bit input and 8bit output.
00226     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
00227     void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00228       assert( w % 16 == 0 && "width must be multiple of 16!" );
00229       const __m128i*  i0 = (const __m128i*)(in);
00230       const int16_t*    i2 = in+2;
00231       uint8_t* result    = out + 1;
00232       const int16_t* const end_input = in + w*h;
00233       const size_t blocked_loops = (w*h-2)/16;
00234       __m128i offs = _mm_set1_epi16( 128 );
00235       for( size_t i=0; i != blocked_loops; i++ ) {
00236         __m128i result_register_lo;
00237         __m128i result_register_hi;
00238         __m128i i2_register;
00239 
00240         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00241         result_register_lo  = *i0;
00242         result_register_lo  = _mm_sub_epi16( result_register_lo, i2_register );
00243         result_register_lo  = _mm_srai_epi16( result_register_lo, 2 );
00244         result_register_lo  = _mm_add_epi16( result_register_lo, offs );
00245  
00246         i0 += 1;
00247         i2 += 8;
00248         
00249         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00250         result_register_hi  = *i0;
00251         result_register_hi  = _mm_sub_epi16( result_register_hi, i2_register );
00252         result_register_hi  = _mm_srai_epi16( result_register_hi, 2 );
00253         result_register_hi  = _mm_add_epi16( result_register_hi, offs );
00254 
00255         i0 += 1;
00256         i2 += 8;
00257         
00258         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00259         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00260 
00261         result += 16;
00262       }
00263 
00264       for( ; i2 < end_input; i2++, result++) {
00265         *result = ((*(i2-2) - *i2)>>2)+128;
00266       }
00267     }
00268     
00269     void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00270       using namespace std;
00271       memset( out_h, 0, w*h*sizeof(int16_t) );
00272       memset( out_v, 0, w*h*sizeof(int16_t) );
00273       assert( w % 16 == 0 && "width must be multiple of 16!" );
00274       const int w_chunk  = w/16;
00275       __m128i*  i0       = (__m128i*)( in );
00276       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00277       __m128i*  i2       = (__m128i*)( in ) + w_chunk*2;
00278       __m128i*  i3       = (__m128i*)( in ) + w_chunk*3;
00279       __m128i*  i4       = (__m128i*)( in ) + w_chunk*4;
00280       __m128i* result_h  = (__m128i*)( out_h ) + 4*w_chunk;
00281       __m128i* result_v  = (__m128i*)( out_v ) + 4*w_chunk;
00282       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00283       __m128i sixes      = _mm_set1_epi16( 6 );
00284       __m128i fours      = _mm_set1_epi16( 4 );
00285       for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {      
00286         __m128i ilo, ihi;
00287         unpack_8bit_to_16bit( *i0, ihi, ilo );
00288         *result_h     = _mm_add_epi16( ihi, *result_h );
00289         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00290         *result_v     = _mm_add_epi16( *result_v, ihi );
00291         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00292         unpack_8bit_to_16bit( *i1, ihi, ilo );
00293         *result_h     = _mm_add_epi16( ihi, *result_h );
00294         *result_h     = _mm_add_epi16( ihi, *result_h );
00295         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00296         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00297         ihi = _mm_mullo_epi16( ihi, fours );
00298         ilo = _mm_mullo_epi16( ilo, fours );
00299         *result_v     = _mm_add_epi16( *result_v, ihi );
00300         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00301         unpack_8bit_to_16bit( *i2, ihi, ilo );
00302         ihi = _mm_mullo_epi16( ihi, sixes );
00303         ilo = _mm_mullo_epi16( ilo, sixes );
00304         *result_v     = _mm_add_epi16( *result_v, ihi );
00305         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00306         unpack_8bit_to_16bit( *i3, ihi, ilo );
00307         *result_h     = _mm_sub_epi16( *result_h, ihi );
00308         *result_h     = _mm_sub_epi16( *result_h, ihi );
00309         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00310         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00311         ihi = _mm_mullo_epi16( ihi, fours );
00312         ilo = _mm_mullo_epi16( ilo, fours );
00313         *result_v     = _mm_add_epi16( *result_v, ihi );
00314         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );          
00315         unpack_8bit_to_16bit( *i4, ihi, ilo );
00316         *result_h     = _mm_sub_epi16( *result_h, ihi );
00317         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00318         *result_v     = _mm_add_epi16( *result_v, ihi );
00319         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00320       }
00321     }
00322     
00323     void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
00324       memset( out, 0, w*h*sizeof(int16_t) );
00325       using namespace std;
00326       assert( w % 16 == 0 && "width must be multiple of 16!" );
00327       const int w_chunk  = w/16;
00328       __m128i*  i0       = (__m128i*)( in );
00329       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00330       __m128i*  i3       = (__m128i*)( in ) + w_chunk*3;
00331       __m128i*  i4       = (__m128i*)( in ) + w_chunk*4;
00332       __m128i* result    = (__m128i*)( out ) + 4*w_chunk;
00333       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00334       for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
00335         __m128i ilo0, ihi0;
00336         unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
00337         __m128i ilo1, ihi1;
00338         unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
00339         *result     = _mm_add_epi16( ihi0, ihi1 );
00340         *(result+1) = _mm_add_epi16( ilo0, ilo1 );
00341         __m128i ilo, ihi;
00342         unpack_8bit_to_16bit( *i3, ihi, ilo );
00343         *result     = _mm_sub_epi16( *result, ihi );
00344         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00345         unpack_8bit_to_16bit( *i4, ihi, ilo );
00346         *result     = _mm_sub_epi16( *result, ihi );
00347         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00348       }
00349     }
00350     
00351     void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
00352       assert( w % 16 == 0 && "width must be multiple of 16!" );
00353       const __m128i*  i0 = (const __m128i*)(in);
00354       const int16_t*    i1 = in+1;
00355       const int16_t*    i3 = in+3;
00356       const int16_t*    i4 = in+4;
00357       int16_t* result    = out + 2;
00358       const int16_t* const end_input = in + w*h;
00359       for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
00360         __m128i result_register;
00361         __m128i i0_register = *i0;
00362         __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00363         __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00364         __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00365         result_register     = _mm_add_epi16( i0_register,     i1_register );
00366         result_register     = _mm_sub_epi16( result_register, i3_register );
00367         result_register     = _mm_sub_epi16( result_register, i4_register );
00368         _mm_storeu_si128( ((__m128i*)( result )), result_register );
00369       }
00370     }
00371     
00372     void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00373       using namespace std;
00374       assert( w % 16 == 0 && "width must be multiple of 16!" );
00375       const int w_chunk  = w/16;
00376       __m128i*  i0       = (__m128i*)( in );
00377       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00378       __m128i*  i2       = (__m128i*)( in ) + w_chunk*2;
00379       __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
00380       __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
00381       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00382       for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
00383         *result_h     = _mm_setzero_si128();
00384         *(result_h+1) = _mm_setzero_si128();
00385         *result_v     = _mm_setzero_si128();
00386         *(result_v+1) = _mm_setzero_si128();
00387         __m128i ilo, ihi;
00388         unpack_8bit_to_16bit( *i0, ihi, ilo ); 
00389         unpack_8bit_to_16bit( *i0, ihi, ilo );
00390         *result_h     = _mm_add_epi16( ihi, *result_h );
00391         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00392         *result_v     = _mm_add_epi16( *result_v, ihi );
00393         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00394         unpack_8bit_to_16bit( *i1, ihi, ilo );
00395         *result_v     = _mm_add_epi16( *result_v, ihi );
00396         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00397         *result_v     = _mm_add_epi16( *result_v, ihi );
00398         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00399         unpack_8bit_to_16bit( *i2, ihi, ilo );
00400         *result_h     = _mm_sub_epi16( *result_h, ihi );
00401         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00402         *result_v     = _mm_add_epi16( *result_v, ihi );
00403         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00404       }
00405     }
00406   };
00407   
00408   void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00409     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00410     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );    
00411     detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
00412     detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
00413     detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
00414     _mm_free( temp_h );
00415     _mm_free( temp_v );
00416   }
00417   
00418   void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00419     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00420     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00421     detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
00422     detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
00423     detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
00424     _mm_free( temp_h );
00425     _mm_free( temp_v );
00426   }
00427   
00428   // -1 -1  0  1  1
00429   // -1 -1  0  1  1
00430   //  0  0  0  0  0
00431   //  1  1  0 -1 -1
00432   //  1  1  0 -1 -1
00433   void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00434     int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00435     detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
00436     detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
00437     _mm_free( temp );
00438   }
00439   
00440   // -1 -1 -1 -1 -1
00441   // -1  1  1  1 -1
00442   // -1  1  8  1 -1
00443   // -1  1  1  1 -1
00444   // -1 -1 -1 -1 -1
00445   void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00446     int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
00447     detail::integral_image( in, integral, w, h );
00448     int16_t* out_ptr   = out + 3 + 3*w;
00449     int16_t* out_end   = out + w * h - 2 - 2*w;
00450     const int32_t* i00 = integral;
00451     const int32_t* i50 = integral + 5;
00452     const int32_t* i05 = integral + 5*w;
00453     const int32_t* i55 = integral + 5 + 5*w;
00454     const int32_t* i11 = integral + 1 + 1*w;
00455     const int32_t* i41 = integral + 4 + 1*w;
00456     const int32_t* i14 = integral + 1 + 4*w;
00457     const int32_t* i44 = integral + 4 + 4*w;    
00458     const uint8_t* im22 = in + 3 + 3*w;
00459     for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
00460       int32_t result = 0;
00461       result = -( *i55 - *i50 - *i05 + *i00 );
00462       result += 2*( *i44 - *i41 - *i14 + *i11 );
00463       result += 7* *im22;
00464       *out_ptr = result;
00465     }
00466     _mm_free( integral );
00467   }
00468 };