dlut_libvo: filter.cpp Source File

Go to the documentation of this file.
00001 /*
00002 Copyright 2012. All rights reserved.
00003 Institute of Measurement and Control Systems
00004 Karlsruhe Institute of Technology, Germany
00005 
00006 This file is part of libelas.
00007 Authors: Julius Ziegler, Andreas Geiger
00008 
00009 libelas is free software; you can redistribute it and/or modify it under the
00010 terms of the GNU General Public License as published by the Free Software
00011 Foundation; either version 3 of the License, or any later version.
00012 
00013 libelas is distributed in the hope that it will be useful, but WITHOUT ANY
00014 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
00015 PARTICULAR PURPOSE. See the GNU General Public License for more details.
00016 
00017 You should have received a copy of the GNU General Public License along with
00018 libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
00019 Street, Fifth Floor, Boston, MA 02110-1301, USA
00020 */
00021 
00022 #include <stdio.h>
00023 #include <iostream>
00024 #include <string.h>
00025 #include <cassert>
00026 
00027 #include "filter.h"
00028 
00029 // define fixed-width datatypes for Visual Studio projects
00030 // Linux env has the stdint.h
00031 #ifndef _MSC_VER
00032   #include <stdint.h>
00033 #else
00034   typedef __int8            int8_t;
00035   typedef __int16           int16_t;
00036   typedef __int32           int32_t;
00037   typedef __int64           int64_t;
00038   typedef unsigned __int8   uint8_t;
00039   typedef unsigned __int16  uint16_t;
00040   typedef unsigned __int32  uint32_t;
00041   typedef unsigned __int64  uint64_t;
00042 #endif
00043 
00044 // fast filters: implements 3x3 and 5x5 sobel filters and
00045 //               5x5 blob and corner filters based on SSE2/3 instructions
00046 namespace filter {
00047 
00048   // private namespace, public user functions at the bottom of this file
00049   namespace detail {
00050       //图像积分
00051     void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
00052       int32_t* out_top = out;
00053       const uint8_t* line_end = in + w;
00054       const uint8_t* in_end   = in + w*h;
00055       int32_t line_sum = 0;
00056       for( ; in != line_end; in++, out++ ) {
00057         line_sum += *in;
00058         *out = line_sum;
00059       }
00060       for( ; in != in_end; ) {
00061         int32_t line_sum = 0;
00062         const uint8_t* line_end = in + w;
00063         for( ; in != line_end; in++, out++, out_top++ ) {
00064           line_sum += *in;
00065           *out = *out_top + line_sum;
00066         }
00067       }
00068     }
00069 
00070     void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
00071       __m128i zero = _mm_setzero_si128();
00072       //b0:0,a0
00073       b0 = _mm_unpacklo_epi8( a, zero );
00074       //b1:0,a1
00075       b1 = _mm_unpackhi_epi8( a, zero );
00076     }
00077 
00078     void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
00079       b = _mm_packus_epi16( a0, a1 );
00080     }
00081 
00082     // convolve image with a (1,4,6,4,1) row vector. Result is accumulated into output.
00083     // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255].
00084     void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00085       assert( w % 16 == 0 && "width must be multiple of 16!" );
00086       const __m128i* i0 = (const __m128i*)(in);
00087       const int16_t* i1 = in+1;
00088       const int16_t* i2 = in+2;
00089       const int16_t* i3 = in+3;
00090       const int16_t* i4 = in+4;
00091       uint8_t* result   = out + 2;
00092       const int16_t* const end_input = in + w*h;
00093       __m128i offs = _mm_set1_epi16( 128 );
00094       for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
00095         __m128i result_register_lo;
00096         __m128i result_register_hi;
00097         for( int i=0; i<2; i++ ) {
00098           __m128i* result_register;
00099           if( i==0 ) result_register = &result_register_lo;
00100           else       result_register = &result_register_hi;
00101           __m128i i0_register = *i0;
00102           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00103           __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00104           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00105           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00106           *result_register = _mm_setzero_si128();
00107           *result_register = _mm_add_epi16( i0_register, *result_register );
00108           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00109           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00110           *result_register = _mm_add_epi16( i1_register, *result_register );
00111           i2_register      = _mm_add_epi16( i2_register, i2_register  );
00112           *result_register = _mm_add_epi16( i2_register, *result_register );
00113           i2_register      = _mm_add_epi16( i2_register, i2_register  );
00114           *result_register = _mm_add_epi16( i2_register, *result_register );
00115           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00116           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00117           *result_register = _mm_add_epi16( i3_register, *result_register );
00118           *result_register = _mm_add_epi16( i4_register, *result_register );
00119           //右移命令,1/128
00120           *result_register = _mm_srai_epi16( *result_register, 7 );
00121           *result_register = _mm_add_epi16( *result_register, offs );
00122           if( i==0 ) {
00123             i0 += 1;
00124             i1 += 8;
00125             i2 += 8;
00126             i3 += 8;
00127             i4 += 8;
00128           }
00129         }
00130         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00131         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00132       }
00133     }
00134 
00135     // convolve image with a (1,2,0,-2,-1) row vector. Result is accumulated into output.
00136     // This one works on 16bit input and 8bit output.
00137     // output is scaled by 1/128, then clamped to [-128,128], and finally shifted to [0,255].
00138     void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00139       assert( w % 16 == 0 && "width must be multiple of 16!" );
00140       const __m128i*  i0 = (const __m128i*)(in);
00141       const int16_t*    i1 = in+1;
00142       const int16_t*    i3 = in+3;
00143       const int16_t*    i4 = in+4;
00144       uint8_t* result    = out + 2;
00145       const int16_t* const end_input = in + w*h;
00146       __m128i offs = _mm_set1_epi16( 128 );
00147       for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
00148         __m128i result_register_lo;
00149         __m128i result_register_hi;
00150         for( int i=0; i<2; i++ ) {
00151           __m128i* result_register;
00152           if( i==0 ) result_register = &result_register_lo;
00153           else       result_register = &result_register_hi;
00154           __m128i i0_register = *i0;
00155           __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00156           __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00157           __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00158           *result_register = _mm_setzero_si128();
00159           *result_register = _mm_add_epi16( i0_register,   *result_register );
00160           i1_register      = _mm_add_epi16( i1_register, i1_register  );
00161           *result_register = _mm_add_epi16( i1_register,   *result_register );
00162           i3_register      = _mm_add_epi16( i3_register, i3_register  );
00163           *result_register = _mm_sub_epi16( *result_register, i3_register );
00164           *result_register = _mm_sub_epi16( *result_register, i4_register );
00165           *result_register = _mm_srai_epi16( *result_register, 7 );
00166           *result_register = _mm_add_epi16( *result_register, offs );
00167           if( i==0 ) {
00168             i0 += 1;
00169             i1 += 8;
00170             i3 += 8;
00171             i4 += 8;
00172           }
00173         }
00174         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00175         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00176       }
00177     }
00178 
00179     // convolve image with a (1,2,1) row vector. Result is accumulated into output.
00180     // This one works on 16bit input and 8bit output.
00181     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
00182     void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00183       assert( w % 16 == 0 && "width must be multiple of 16!" );
00184       const __m128i* i0 = (const __m128i*)(in);
00185       const int16_t* i1 = in+1;
00186       const int16_t* i2 = in+2;
00187       uint8_t* result   = out + 1;
00188       const int16_t* const end_input = in + w*h;
00189       const size_t blocked_loops = (w*h-2)/16;
00190       __m128i offs = _mm_set1_epi16( 128 );
00191       for( size_t i=0; i != blocked_loops; i++ ) {
00192         __m128i result_register_lo;
00193         __m128i result_register_hi;
00194         __m128i i1_register;
00195         __m128i i2_register;
00196 
00197         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
00198         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
00199         result_register_lo = *i0;
00200         /*result_register_lo = _mm_loadu_si128(i0);*/
00201         //i1*2
00202         i1_register        = _mm_add_epi16( i1_register, i1_register );
00203         //i0+2*i1;
00204         result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
00205         //i0+2*i1+i2;
00206         result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
00207         //1/4
00208         result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
00209         //+=128
00210         result_register_lo = _mm_add_epi16( result_register_lo, offs );
00211 
00212         i0++;
00213         i1+=8;
00214         i2+=8;
00215 
00216         i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
00217         i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
00218         result_register_hi = *i0;
00219         i1_register        = _mm_add_epi16( i1_register, i1_register );
00220         result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
00221         result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
00222         result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
00223         result_register_hi = _mm_add_epi16( result_register_hi, offs );
00224 
00225         i0++;
00226         i1+=8;
00227         i2+=8;
00228 
00229         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00230         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00231 
00232         result += 16;
00233       }
00234     }
00235 
00236     // convolve image with a (1,0,-1) row vector. Result is accumulated into output.
00237     // This one works on 16bit input and 8bit output.
00238     // output is scaled by 1/4, then clamped to [-128,128], and finally shifted to [0,255].
00239     void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
00240       assert( w % 16 == 0 && "width must be multiple of 16!" );
00241       const __m128i*  i0 = (const __m128i*)(in);
00242       const int16_t*    i2 = in+2;
00243       uint8_t* result    = out + 1;
00244       const int16_t* const end_input = in + w*h;
00245       const size_t blocked_loops = (w*h-2)/16;
00246       __m128i offs = _mm_set1_epi16( 128 );
00247       for( size_t i=0; i != blocked_loops; i++ ) {
00248         __m128i result_register_lo;
00249         __m128i result_register_hi;
00250         __m128i i2_register;
00251 
00252         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00253         result_register_lo  = *i0;
00254         result_register_lo  = _mm_sub_epi16( result_register_lo, i2_register );
00255         result_register_lo  = _mm_srai_epi16( result_register_lo, 2 );
00256         result_register_lo  = _mm_add_epi16( result_register_lo, offs );
00257 
00258         i0 += 1;
00259         i2 += 8;
00260 
00261         i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
00262         result_register_hi  = *i0;
00263         result_register_hi  = _mm_sub_epi16( result_register_hi, i2_register );
00264         result_register_hi  = _mm_srai_epi16( result_register_hi, 2 );
00265         result_register_hi  = _mm_add_epi16( result_register_hi, offs );
00266 
00267         i0 += 1;
00268         i2 += 8;
00269 
00270         pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
00271         _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
00272 
00273         result += 16;
00274       }
00275 
00276       for( ; i2 < end_input; i2++, result++) {
00277         *result = ((*(i2-2) - *i2)>>2)+128;
00278       }
00279     }
00280 
00281     void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00282       using namespace std;
00283       memset( out_h, 0, w*h*sizeof(int16_t) );
00284       memset( out_v, 0, w*h*sizeof(int16_t) );
00285       assert( w % 16 == 0 && "width must be multiple of 16!" );
00286       //std::cout<<"w="<<w<<"h="<<h<<std::endl;
00287       //w=512,w_chunk=32
00288       //一行有多少个__m128i
00289       const int w_chunk  = w/16;
00290       __m128i*  i0       = (__m128i*)( in );
00291       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00292       __m128i*  i2       = (__m128i*)( in ) + w_chunk*2;
00293       __m128i*  i3       = (__m128i*)( in ) + w_chunk*3;
00294       __m128i*  i4       = (__m128i*)( in ) + w_chunk*4;
00295       //水平方向,从第三行开始
00296       __m128i* result_h  = (__m128i*)( out_h ) + 4*w_chunk;
00297       //竖直方向
00298       __m128i* result_v  = (__m128i*)( out_v ) + 4*w_chunk;
00299       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00300       //八个数全是6
00301       __m128i sixes      = _mm_set1_epi16( 6 );
00302       __m128i fours      = _mm_set1_epi16( 4 );
00303       for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {
00304         __m128i ilo, ihi;
00305         //扩展成16bits
00306         unpack_8bit_to_16bit( *i0, ihi, ilo );
00307         //高八位数字与地八位数字
00308         *result_h     = _mm_add_epi16( ihi, *result_h );
00309         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00310         //高低八位数字
00311         *result_v     = _mm_add_epi16( *result_v, ihi );
00312         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00313 
00314         //ihi,ilo已经被清空,重新装入i1的数据
00315         unpack_8bit_to_16bit( *i1, ihi, ilo );
00316         //i0+2i1
00317         *result_h     = _mm_add_epi16( ihi, *result_h );
00318         *result_h     = _mm_add_epi16( ihi, *result_h );
00319         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00320         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00321         ihi = _mm_mullo_epi16( ihi, fours );
00322         ilo = _mm_mullo_epi16( ilo, fours );
00323         //4倍高低八位
00324         //i0+4i1
00325         *result_v     = _mm_add_epi16( *result_v, ihi );
00326         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00327 
00328         unpack_8bit_to_16bit( *i2, ihi, ilo );
00329         //i0+2i1+0i2
00330         ihi = _mm_mullo_epi16( ihi, sixes );
00331         ilo = _mm_mullo_epi16( ilo, sixes );
00332         //6倍高低八位
00333         //i0+4i1+6i2
00334         *result_v     = _mm_add_epi16( *result_v, ihi );
00335         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00336 
00337         unpack_8bit_to_16bit( *i3, ihi, ilo );
00338         //i0+2i1+0i2-2i1
00339         *result_h     = _mm_sub_epi16( *result_h, ihi );
00340         *result_h     = _mm_sub_epi16( *result_h, ihi );
00341         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00342         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00343         ihi = _mm_mullo_epi16( ihi, fours );
00344         ilo = _mm_mullo_epi16( ilo, fours );
00345         //i0+4i1+6i2+4i3
00346         *result_v     = _mm_add_epi16( *result_v, ihi );
00347         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00348 
00349         unpack_8bit_to_16bit( *i4, ihi, ilo );
00350         //i0+2i1+0i2-2i3-i4
00351         *result_h     = _mm_sub_epi16( *result_h, ihi );
00352         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00353         //i0+4i1+6i2+4i3+i4
00354         *result_v     = _mm_add_epi16( *result_v, ihi );
00355         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00356       }
00357     }
00358 
00359     void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
00360       memset( out, 0, w*h*sizeof(int16_t) );
00361       using namespace std;
00362       assert( w % 16 == 0 && "width must be multiple of 16!" );
00363       const int w_chunk  = w/16;
00364       __m128i*  i0       = (__m128i*)( in );
00365       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00366       __m128i*  i3       = (__m128i*)( in ) + w_chunk*3;
00367       __m128i*  i4       = (__m128i*)( in ) + w_chunk*4;
00368       __m128i* result    = (__m128i*)( out ) + 4*w_chunk;
00369       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00370       for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
00371         __m128i ilo0, ihi0;
00372         unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
00373         __m128i ilo1, ihi1;
00374         unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
00375         *result     = _mm_add_epi16( ihi0, ihi1 );
00376         *(result+1) = _mm_add_epi16( ilo0, ilo1 );
00377         __m128i ilo, ihi;
00378         unpack_8bit_to_16bit( *i3, ihi, ilo );
00379         *result     = _mm_sub_epi16( *result, ihi );
00380         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00381         unpack_8bit_to_16bit( *i4, ihi, ilo );
00382         *result     = _mm_sub_epi16( *result, ihi );
00383         *(result+1) = _mm_sub_epi16( *(result+1), ilo );
00384       }
00385     }
00386 
00387     void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
00388       assert( w % 16 == 0 && "width must be multiple of 16!" );
00389       const __m128i*  i0 = (const __m128i*)(in);
00390       const int16_t*    i1 = in+1;
00391       const int16_t*    i3 = in+3;
00392       const int16_t*    i4 = in+4;
00393       int16_t* result    = out + 2;
00394       const int16_t* const end_input = in + w*h;
00395       for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
00396         __m128i result_register;
00397         __m128i i0_register = *i0;
00398         __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
00399         __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
00400         __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
00401         result_register     = _mm_add_epi16( i0_register,     i1_register );
00402         result_register     = _mm_sub_epi16( result_register, i3_register );
00403         result_register     = _mm_sub_epi16( result_register, i4_register );
00404         _mm_storeu_si128( ((__m128i*)( result )), result_register );
00405       }
00406     }
00407 
00408     void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
00409       using namespace std;
00410       assert( w % 16 == 0 && "width must be multiple of 16!" );
00411       const int w_chunk  = w/16;
00412       __m128i*  i0       = (__m128i*)( in );
00413       __m128i*  i1       = (__m128i*)( in ) + w_chunk*1;
00414       __m128i*  i2       = (__m128i*)( in ) + w_chunk*2;
00415       __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
00416       __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
00417       __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
00418       for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
00419         *result_h     = _mm_setzero_si128();
00420         *(result_h+1) = _mm_setzero_si128();
00421         *result_v     = _mm_setzero_si128();
00422         *(result_v+1) = _mm_setzero_si128();
00423         __m128i ilo, ihi;
00424         unpack_8bit_to_16bit( *i0, ihi, ilo );
00425         unpack_8bit_to_16bit( *i0, ihi, ilo );
00426         *result_h     = _mm_add_epi16( ihi, *result_h );
00427         *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
00428         *result_v     = _mm_add_epi16( *result_v, ihi );
00429         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00430         unpack_8bit_to_16bit( *i1, ihi, ilo );
00431         *result_v     = _mm_add_epi16( *result_v, ihi );
00432         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00433         *result_v     = _mm_add_epi16( *result_v, ihi );
00434         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00435         unpack_8bit_to_16bit( *i2, ihi, ilo );
00436         *result_h     = _mm_sub_epi16( *result_h, ihi );
00437         *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
00438         *result_v     = _mm_add_epi16( *result_v, ihi );
00439         *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
00440       }
00441     }
00442   };
00443 
00444   void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00445     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00446     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00447     detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
00448     detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
00449     detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
00450     _mm_free( temp_h );
00451     _mm_free( temp_v );
00452   }
00453 
00454   //小核对噪声更加敏感,大核在更多像素上计算拟合
00455   void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
00456     int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00457     int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00458     detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
00459     detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
00460     detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
00461     _mm_free( temp_h );
00462     _mm_free( temp_v );
00463   }
00464 
00465   // -1 -1  0  1  1
00466   // -1 -1  0  1  1
00467   //  0  0  0  0  0
00468   //  1  1  0 -1 -1
00469   //  1  1  0 -1 -1
00470   void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00471     int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
00472     detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
00473     detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
00474     _mm_free( temp );
00475   }
00476 
00477   // -1 -1 -1 -1 -1
00478   // -1  1  1  1 -1
00479   // -1  1  8  1 -1
00480   // -1  1  1  1 -1
00481   // -1 -1 -1 -1 -1
00482   void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
00483     int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
00484     detail::integral_image( in, integral, w, h );
00485     int16_t* out_ptr   = out + 3 + 3*w;
00486     int16_t* out_end   = out + w * h - 2 - 2*w;
00487     const int32_t* i00 = integral;
00488     const int32_t* i50 = integral + 5;
00489     const int32_t* i05 = integral + 5*w;
00490     const int32_t* i55 = integral + 5 + 5*w;
00491     const int32_t* i11 = integral + 1 + 1*w;
00492     const int32_t* i41 = integral + 4 + 1*w;
00493     const int32_t* i14 = integral + 1 + 4*w;
00494     const int32_t* i44 = integral + 4 + 4*w;
00495     const uint8_t* im22 = in + 3 + 3*w;
00496     for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
00497       int32_t result = 0;
00498       result = -( *i55 - *i50 - *i05 + *i00 );
00499       result += 2*( *i44 - *i41 - *i14 + *i11 );
00500       result += 7* *im22;
00501       *out_ptr = result;
00502     }
00503     _mm_free( integral );
00504   }
00505 };