libvlfeat: imopv_sse2.c Source File

Go to the documentation of this file.
00001 
00006 /*
00007 Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
00008 All rights reserved.
00009 
00010 This file is part of the VLFeat library and is made available under
00011 the terms of the BSD license (see the COPYING file).
00012 */
00013 
/* Build-time sanity check: unless SSE2 support was explicitly disabled
 * with VL_DISABLE_SSE2, the compiler must define __SSE2__.
 * The `&` below is a bitwise AND; both operands are `defined` results
 * (0 or 1), so it selects the same branch a logical AND would. */
00014 #if ! defined(VL_DISABLE_SSE2) & ! defined(__SSE2__)
00015 #error "Compiling with SSE2 enabled, but no __SSE2__ defined"
00016 #endif
00017 
/* Everything from here to the final #endif is compiled only when SSE2
 * is not disabled. */
00018 #if ! defined(VL_DISABLE_SSE2)
00019 
/* First pass (VL_IMOPV_SSE2_INSTANTIATING not defined): pull in the
 * headers, then include this very file twice, once per value of FLT.
 * Each nested inclusion takes the #else branch below and undefines both
 * FLT and VL_IMOPV_SSE2_INSTANTIATING when it finishes, which is why
 * they are defined again before the second inclusion. */
00020 #ifndef VL_IMOPV_SSE2_INSTANTIATING
00021 
00022 #include <emmintrin.h>
00023 
00024 #include "imopv.h"
00025 #include "imopv_sse2.h"
00026 
/* Instantiation with FLT set to VL_TYPE_FLOAT. */
00027 #define FLT VL_TYPE_FLOAT
00028 #define VL_IMOPV_SSE2_INSTANTIATING
00029 #include "imopv_sse2.c"
00030 
/* Instantiation with FLT set to VL_TYPE_DOUBLE. */
00031 #define FLT VL_TYPE_DOUBLE
00032 #define VL_IMOPV_SSE2_INSTANTIATING
00033 #include "imopv_sse2.c"
00034 
00035 /* ---------------------------------------------------------------- */
00036 /* VL_IMOPV_SSE2_INSTANTIATING */
00037 #else
00038 
/* Instantiating pass: the code below is written in terms of T, SFX,
 * VTYPE, VSIZE, VSTZ, VLD1, VADD, VMUL and VALIGNED -- presumably all
 * provided by "float.th" according to FLT (TODO confirm against that
 * header). */
00039 #include "float.th"
00040 
00041 /* ---------------------------------------------------------------- */
00042 void
00043 VL_XCAT3(_vl_imconvcol_v, SFX, _sse2)
00044 (T* dst, vl_size dst_stride,
00045  T const* src,
00046  vl_size src_width, vl_size src_height, vl_size src_stride,
00047  T const* filt, vl_index filt_begin, vl_index filt_end,
00048  int step, unsigned int flags)
00049 {
00050   vl_index x = 0 ;
00051   vl_index y ;
00052   vl_index dheight = (src_height - 1) / step + 1 ;
00053   vl_bool use_simd  = VALIGNED(src_stride) ;
00054   vl_bool transp    = flags & VL_TRANSPOSE ;
00055   vl_bool zeropad   = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00056   double totcol = 0 ;
00057   double simdcol = 0 ;
00058 
00059   /* let filt point to the last sample of the filter */
00060   filt += filt_end - filt_begin ;
00061 
00062   while (x < (signed)src_width) {
00063     /* Calculate dest[x,y] = sum_p image[x,p] filt[y - p]
00064      * where supp(filt) = [filt_begin, filt_end] = [fb,fe].
00065      *
00066      * CHUNK_A: y - fe <= p < 0
00067      *          completes VL_MAX(fe - y, 0) samples
00068      * CHUNK_B: VL_MAX(y - fe, 0) <= p < VL_MIN(y - fb, height - 1)
00069      *          completes fe - VL_MAX(fb, height - y) + 1 samples
00070      * CHUNK_C: completes all samples
00071      */
00072 
00073     T const *filti ;
00074     vl_index stop ;
00075 
00076     if ((x + VSIZE < (signed)src_width) &
00077         VALIGNED(src + x) & use_simd)
00078     {
00079       /* ----------------------------------------------  Vectorized */
00080       for (y = 0 ; y < (signed)src_height ; y += step)  {
00081         union {VTYPE v ; T x [VSIZE] ; } acc ;
00082         VTYPE v, c ;
00083         T const *srci ;
00084         acc.v = VSTZ () ;
00085         v = VSTZ() ;
00086 
00087         filti = filt ;
00088         stop = filt_end - y ;
00089         srci = src + x - stop * src_stride ;
00090 
00091         if (stop > 0) {
00092           if (zeropad) {
00093             v = VSTZ () ;
00094           } else {
00095             v = * (VTYPE*) (src + x) ;
00096           }
00097           while (filti > filt - stop) {
00098             c = VLD1 (filti--) ;
00099             acc.v = VADD (acc.v,  VMUL (v, c)) ;
00100             srci += src_stride ;
00101           }
00102         }
00103 
00104         stop = filt_end - VL_MAX(filt_begin, y - (signed)src_height + 1) + 1 ;
00105         while (filti > filt - stop) {
00106           v = * (VTYPE*) srci ;
00107           c = VLD1 (filti--) ;
00108           acc.v = VADD (acc.v, VMUL (v, c)) ;
00109           srci += src_stride ;
00110         }
00111 
00112         if (zeropad) v = VSTZ () ;
00113 
00114         stop = filt_end - filt_begin + 1;
00115         while (filti > filt - stop) {
00116           c = VLD1 (filti--) ;
00117           acc.v = VADD (acc.v, VMUL (v, c)) ;
00118         }
00119 
00120         if (transp) {
00121           *dst = acc.x[0] ; dst += dst_stride ;
00122           *dst = acc.x[1] ; dst += dst_stride ;
00123 #if(VSIZE == 4)
00124           *dst = acc.x[2] ; dst += dst_stride ;
00125           *dst = acc.x[3] ; dst += dst_stride ;
00126 #endif
00127           dst += 1 * 1 - VSIZE * dst_stride ;
00128         } else {
00129           *dst = acc.x[0] ; dst += 1 ;
00130           *dst = acc.x[1] ; dst += 1 ;
00131 #if(VSIZE == 4)
00132           *dst = acc.x[2] ; dst += 1 ;
00133           *dst = acc.x[3] ; dst += 1 ;
00134 #endif
00135           dst += 1 * dst_stride - VSIZE * 1 ;
00136         }
00137       } /* next y */
00138       if (transp) {
00139         dst += VSIZE * dst_stride - dheight * 1 ;
00140       } else {
00141         dst += VSIZE * 1 - dheight * dst_stride ;
00142       }
00143       x       += VSIZE ;
00144       simdcol += VSIZE ;
00145       totcol  += VSIZE ;
00146     } else {
00147       /* -------------------------------------------------  Vanilla */
00148       for (y = 0 ; y < (signed)src_height ; y += step) {
00149         T acc = 0 ;
00150         T v = 0, c ;
00151         T const* srci ;
00152 
00153         filti = filt ;
00154         stop = filt_end - y ;
00155         srci = src + x - stop * src_stride ;
00156 
00157         if (stop > 0) {
00158           if (zeropad) {
00159             v = 0 ;
00160           } else {
00161             v = *(src + x) ;
00162           }
00163           while (filti > filt - stop) {
00164             c = *filti-- ;
00165             acc += v * c ;
00166             srci += src_stride ;
00167           }
00168         }
00169 
00170         stop = filt_end - VL_MAX(filt_begin, y - (signed)src_height + 1) + 1 ;
00171         while (filti > filt - (signed)stop) {
00172           v = *srci ;
00173           c = *filti-- ;
00174           acc += v * c ;
00175           srci += src_stride ;
00176         }
00177 
00178         if (zeropad) v = 0 ;
00179 
00180         stop = filt_end - filt_begin + 1 ;
00181         while (filti > filt - stop) {
00182           c = *filti-- ;
00183           acc += v * c ;
00184         }
00185 
00186         if (transp) {
00187           *dst = acc ; dst += 1 ;
00188         } else {
00189           *dst = acc ; dst += dst_stride ;
00190         }
00191       } /* next y */
00192       if (transp) {
00193         dst += 1 * dst_stride - dheight * 1 ;
00194       } else {
00195         dst += 1 * 1 - dheight * dst_stride ;
00196       }
00197       x      += 1 ;
00198       totcol += 1 ;
00199     } /* next x */
00200   }
00201 }
00202 
00203 /* ---------------------------------------------------------------- */
/* Disabled code: everything between this `#if 0` and its `#endif` is
 * skipped by the preprocessor.
 *
 * The function appears to be a column filter built from running sums
 * (presumably a triangular filter, judging by the name only -- TODO
 * confirm).  Only the scalar branch is implemented: use_simd is forced to
 * 0 inside the loop and the SIMD branch is empty.
 *
 * Observations for whoever revives it:
 *  - NOTE(review): the name is built with VL_XCAT given three arguments
 *    and the suffix `sse2`, whereas the compiled function in this file
 *    uses VL_XCAT3 and `_sse2`; verify before enabling.
 *  - The result of vl_malloc is used without being checked.
 *  - dheight (and therefore step) is computed but never used.
 *  - The helper macro `fa` is defined inside the body and never undefined.
 */
00204 #if 0
00205 void
00206 VL_XCAT(_vl_imconvcoltri_v, SFX, sse2)
00207 (T* dst, int dst_stride,
00208  T const* src,
00209  int src_width, int src_height, int src_stride,
00210  int filt_size,
00211  int step, unsigned int flags)
00212 {
00213   int x = 0 ;
00214   int y ;
00215   int dheight = (src_height - 1) / step + 1 ;
00216   vl_bool use_simd  = ((src_stride & ALIGNSTRIDE) == 0) &&
00217   (! (flags & VL_NO_SIMD)) ;
00218   vl_bool transp = flags & VL_TRANSPOSE ;
00219   vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00220 
  /* Scratch column of src_height + filt_size elements.  buff is then
   * shifted by filt_size so that indices -filt_size .. src_height - 1
   * are addressable; the matching vl_free undoes the shift. */
00221   T * buff = vl_malloc(sizeof(T) * (src_height + filt_size)) ;
00222 #define fa (1.0 / (double) (filt_size + 1))
  /* Output normalisation: (1 / (filt_size + 1)) to the fourth power. */
00223   T scale = fa*fa*fa*fa ;
00224   buff += filt_size ;
00225 
  /* One source column per iteration. */
00226   while (x < src_width) {
00227     T const *srci ;
00228 
    /* SIMD is unconditionally switched off, so the empty branch below is
     * never taken. */
00229     use_simd = 0 ;
00230     if ((x + VSIZE < src_width) &
00231         (((vl_ptrint)(src + x) & ALIGNPTR) == 0) &
00232         use_simd)
00233     {
00234 
00235     } else {
      /* Output strides along x and y, swapped when transposing. */
00236       int stridex = transp ? dst_stride : 1 ;
00237       int stridey = transp ? 1 : dst_stride ;
      /* Start from the last row of column x. */
00238       srci = src + x + src_stride * (src_height - 1) ;
00239 
00240       /* integrate backward the column */
00241       buff [src_height - 1] = *srci ;
00242       for (y = src_height-2 ; y >=  0 ; --y) {
00243         srci -= src_stride ;
00244         buff [y] = buff [y+1] + *srci ;
00245       }
      /* Extend the running sum over indices -1 .. -filt_size: with zero
       * padding the sum stays constant, otherwise the sample srci still
       * points to keeps being added. */
00246       if (zeropad) {
00247         for ( ; y >= - filt_size ; --y) {
00248           buff [y] = buff [y+1] ;
00249         }
00250       } else {
00251         for ( ; y >= - filt_size ; --y) {
00252           buff [y] = buff[y+1] + *srci ;
00253         }
00254       }
00255 
00256       /* compute the filter forward */
00257       for (y = - filt_size ; y < src_height - filt_size ; ++y) {
00258         buff [y] = buff [y] - buff [y + filt_size] ;
00259       }
      /* Without zero padding, the last filt_size entries are corrected
       * with a multiple of buff[src_height - 1]. */
00260       if (! zeropad) {
00261         for (y = src_height - filt_size ; y < src_height ; ++y) {
00262           buff [y] = buff [y] - buff [src_height-1]  *
00263           (src_height - filt_size - y) ;
00264         }
00265       }
00266 
00267       /* integrate forward the column */
00268       for (y = - filt_size + 1 ; y < src_height ; ++y) {
00269         buff [y] += buff [y - 1] ;
00270       }
00271 
00272       /* compute the filter backward */
00273       for (y = src_height - 1 ; y >= 0 ; --y) {
00274         dst [x*stridex + y*stridey]
00275         = scale * (buff [y] - buff [y - filt_size]) ;
00276       }
00277     } /* end of the scalar path */
00278     x += 1 ;
00279   }
  /* Undo the filt_size shift before releasing the scratch buffer. */
00280   vl_free (buff - filt_size) ;
00281 }
00282 #endif
00283 
/* End of one instantiating pass: clear FLT and the instantiation flag so
 * the including pass can define them again for the next element type. */
00284 #undef FLT
00285 #undef VL_IMOPV_SSE2_INSTANTIATING
/* Closes the #ifndef VL_IMOPV_SSE2_INSTANTIATING / #else conditional. */
00286 #endif
00287 
00288 /* ! VL_DISABLE_SSE2 */
00289 #endif