Go to the documentation of this file.
00001
00006
00007
00008
00009
00010
00011
00012
00013
00014 #if ! defined(VL_DISABLE_SSE2) & ! defined(__SSE2__)
00015 #error "Compiling with SSE2 enabled, but no __SSE2__ defined"
00016 #endif
00017 /* The check above stops the build when SSE2 code is wanted (VL_DISABLE_SSE2 not defined) but __SSE2__ is not defined. */
00018 #if ! defined(VL_DISABLE_SSE2)
00019 /* Everything from here to the last #endif of the listing is compiled only when SSE2 has not been disabled. */
00020 #ifndef VL_IMOPV_SSE2_INSTANTIATING
00021 /* Driver pass (guard not yet defined): include the headers once, then include "imopv_sse2.c" once per value of FLT. */
00022 #include <emmintrin.h>
00023
00024 #include "imopv.h"
00025 #include "imopv_sse2.h"
00026 /* First instantiation, FLT = VL_TYPE_FLOAT. The guard makes the included text take its #else (template) branch -- presumably "imopv_sse2.c" is this same file; confirm. */
00027 #define FLT VL_TYPE_FLOAT
00028 #define VL_IMOPV_SSE2_INSTANTIATING
00029 #include "imopv_sse2.c"
00030 /* Second instantiation, FLT = VL_TYPE_DOUBLE. Defining both macros again relies on the template branch ending with #undef FLT and #undef VL_IMOPV_SSE2_INSTANTIATING. */
00031 #define FLT VL_TYPE_DOUBLE
00032 #define VL_IMOPV_SSE2_INSTANTIATING
00033 #include "imopv_sse2.c"
00034
00035
00036
00037 #else
00038 /* Template pass: T, SFX, VTYPE, VSIZE and the V* helper macros used below are not defined in this listing; presumably "float.th" derives them from FLT -- confirm. */
00039 #include "float.th"
00040 /* Convolves every column of a src_width x src_height image with a 1-D filter; instantiated per element type T, with the function name built by VL_XCAT3 from SFX. */
00041 /* For each column x and each sampled row y = 0, step, 2*step, ... it accumulates src[x, p] * tap(y - p) for p = y - filt_end .. y - filt_begin; rows p outside the image are padded according to flags. */
00042 void
00043 VL_XCAT3(_vl_imconvcol_v, SFX, _sse2)
00044 (T* dst, vl_size dst_stride, /* output buffer; dst_stride is added to dst as a count of T elements */
00045 T const* src, /* input image; row r of column x is read at src + x + r * src_stride */
00046 vl_size src_width, vl_size src_height, vl_size src_stride, /* image size and row stride, in T elements */
00047 T const* filt, vl_index filt_begin, vl_index filt_end, /* filt_end - filt_begin + 1 taps; filt[0] is the tap at filt_begin, the last element the tap at filt_end */
00048 int step, unsigned int flags) /* step: increment of y between outputs; flags: VL_TRANSPOSE bit and padding mode under VL_PAD_MASK */
00049 {
00050 vl_index x = 0 ;
00051 vl_index y ;
00052 vl_index dheight = (src_height - 1) / step + 1 ; /* number of outputs written per column (iterations of the y loops) */
00053 vl_bool use_simd = VALIGNED(src_stride) ; /* SIMD branch is considered only if this holds; presumably an alignment test on the stride -- confirm */
00054 vl_bool transp = flags & VL_TRANSPOSE ; /* selects the transposed output layout */
00055 vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ; /* any other padding mode is handled here by repeating the border row */
00056 double totcol = 0 ; /* NOTE(review): totcol and simdcol are only incremented, never read, in this function */
00057 double simdcol = 0 ;
00058
00059 /* make filt point to the last tap; the loops below walk the taps backwards with filti-- */
00060 filt += filt_end - filt_begin ;
00061
00062 while (x < (signed)src_width) {
00063 /* Each pass of this loop processes VSIZE adjacent columns (SIMD branch) or one column (scalar branch). */
00064 /* For an output row y the taps are consumed from the last one backwards while the source row advances from y - filt_end. */
00065 /* Stage 1: the filt_end - y rows above the image (when positive) use v = 0 or the top row at src + x. */
00066 /* Stage 2: rows inside the image are loaded from srci, src_stride elements apart. */
00067 /* Stage 3: the remaining taps (rows below the image) reuse the last v, or 0 when zero padding. */
00068 /* In every stage `stop` is the cumulative number of taps that have been consumed when the stage ends. */
00069 /* NOTE(review): stage 1 is bounded by `stop` only, not by the filter length -- looks like it assumes filt_begin <= 1; confirm. */
00070
00071
00072
00073 T const *filti ;
00074 vl_index stop ;
00075
00076 if ((x + VSIZE < (signed)src_width) & /* NOTE(review): strict '<' sends a last group with x + VSIZE == src_width to the scalar branch */
00077 VALIGNED(src + x) & use_simd)
00078 {
00079 /* ---- vectorized branch: VSIZE columns per iteration ---- */
00080 for (y = 0 ; y < (signed)src_height ; y += step) {
00081 union {VTYPE v ; T x [VSIZE] ; } acc ; /* acc.x gives per-lane access to the vector accumulator acc.v */
00082 VTYPE v, c ;
00083 T const *srci ;
00084 acc.v = VSTZ () ;
00085 v = VSTZ() ;
00086
00087 filti = filt ;
00088 stop = filt_end - y ; /* number of taps that fall on rows above the image, if positive */
00089 srci = src + x - stop * src_stride ; /* address of row y - filt_end in these columns */
00090 /* stage 1: padding above the image */
00091 if (stop > 0) {
00092 if (zeropad) {
00093 v = VSTZ () ;
00094 } else {
00095 v = * (VTYPE*) (src + x) ; /* top row of these columns, read through a VTYPE pointer */
00096 }
00097 while (filti > filt - stop) {
00098 c = VLD1 (filti--) ;
00099 acc.v = VADD (acc.v, VMUL (v, c)) ;
00100 srci += src_stride ; /* srci is only advanced here, not dereferenced */
00101 }
00102 }
00103 /* stage 2: rows inside the image; stop is limited by both the filter length and the bottom of the image */
00104 stop = filt_end - VL_MAX(filt_begin, y - (signed)src_height + 1) + 1 ;
00105 while (filti > filt - stop) {
00106 v = * (VTYPE*) srci ;
00107 c = VLD1 (filti--) ;
00108 acc.v = VADD (acc.v, VMUL (v, c)) ;
00109 srci += src_stride ;
00110 }
00111 /* stage 3: padding below the image; without zero padding v keeps the last value assigned above */
00112 if (zeropad) v = VSTZ () ;
00113
00114 stop = filt_end - filt_begin + 1;
00115 while (filti > filt - stop) {
00116 c = VLD1 (filti--) ;
00117 acc.v = VADD (acc.v, VMUL (v, c)) ;
00118 }
00119 /* store the lane results: dst_stride apart when transposed, contiguous otherwise */
00120 if (transp) {
00121 *dst = acc.x[0] ; dst += dst_stride ;
00122 *dst = acc.x[1] ; dst += dst_stride ;
00123 #if(VSIZE == 4) /* lanes 2 and 3 are stored only when VSIZE is 4 */
00124 *dst = acc.x[2] ; dst += dst_stride ;
00125 *dst = acc.x[3] ; dst += dst_stride ;
00126 #endif
00127 dst += 1 * 1 - VSIZE * dst_stride ; /* net effect of this branch: dst advances by 1 */
00128 } else {
00129 *dst = acc.x[0] ; dst += 1 ;
00130 *dst = acc.x[1] ; dst += 1 ;
00131 #if(VSIZE == 4) /* lanes 2 and 3 are stored only when VSIZE is 4 */
00132 *dst = acc.x[2] ; dst += 1 ;
00133 *dst = acc.x[3] ; dst += 1 ;
00134 #endif
00135 dst += 1 * dst_stride - VSIZE * 1 ; /* net effect of this branch: dst advances by dst_stride */
00136 }
00137 }
00138 if (transp) {
00139 dst += VSIZE * dst_stride - dheight * 1 ; /* undo the dheight unit steps, move on by VSIZE strides */
00140 } else {
00141 dst += VSIZE * 1 - dheight * dst_stride ; /* undo the dheight stride steps, move on by VSIZE elements */
00142 }
00143 x += VSIZE ;
00144 simdcol += VSIZE ;
00145 totcol += VSIZE ;
00146 } else {
00147 /* ---- scalar branch: one column per iteration, same three stages as above ---- */
00148 for (y = 0 ; y < (signed)src_height ; y += step) {
00149 T acc = 0 ;
00150 T v = 0, c ;
00151 T const* srci ;
00152
00153 filti = filt ;
00154 stop = filt_end - y ; /* number of taps that fall on rows above the image, if positive */
00155 srci = src + x - stop * src_stride ; /* address of row y - filt_end in column x */
00156 /* stage 1: padding above the image */
00157 if (stop > 0) {
00158 if (zeropad) {
00159 v = 0 ;
00160 } else {
00161 v = *(src + x) ; /* top row of column x */
00162 }
00163 while (filti > filt - stop) {
00164 c = *filti-- ;
00165 acc += v * c ;
00166 srci += src_stride ;
00167 }
00168 }
00169 /* stage 2: rows inside the image */
00170 stop = filt_end - VL_MAX(filt_begin, y - (signed)src_height + 1) + 1 ;
00171 while (filti > filt - (signed)stop) { /* NOTE(review): the (signed) cast on stop appears only here, not in the matching SIMD loop */
00172 v = *srci ;
00173 c = *filti-- ;
00174 acc += v * c ;
00175 srci += src_stride ;
00176 }
00177 /* stage 3: padding below the image */
00178 if (zeropad) v = 0 ;
00179
00180 stop = filt_end - filt_begin + 1 ;
00181 while (filti > filt - stop) {
00182 c = *filti-- ;
00183 acc += v * c ;
00184 }
00185 /* store one result and step to the next output of this column */
00186 if (transp) {
00187 *dst = acc ; dst += 1 ;
00188 } else {
00189 *dst = acc ; dst += dst_stride ;
00190 }
00191 }
00192 if (transp) {
00193 dst += 1 * dst_stride - dheight * 1 ; /* undo the dheight unit steps, move on by one stride */
00194 } else {
00195 dst += 1 * 1 - dheight * dst_stride ; /* undo the dheight stride steps, move on by one element */
00196 }
00197 x += 1 ;
00198 totcol += 1 ;
00199 }
00200 }
00201 }
00202 /* The function below is excluded from compilation by #if 0. It filters each column through running sums held in a temporary buffer and scales the result by fa^4. */
00203 /* NOTE(review): the name is built with VL_XCAT given three arguments and `sse2` without the underscore, unlike VL_XCAT3(..., _sse2) used above -- confirm before re-enabling. */
00204 #if 0
00205 void
00206 VL_XCAT(_vl_imconvcoltri_v, SFX, sse2)
00207 (T* dst, int dst_stride,
00208 T const* src,
00209 int src_width, int src_height, int src_stride,
00210 int filt_size,
00211 int step, unsigned int flags)
00212 {
00213 int x = 0 ;
00214 int y ;
00215 int dheight = (src_height - 1) / step + 1 ; /* NOTE(review): dheight and step are not used anywhere else in this function */
00216 vl_bool use_simd = ((src_stride & ALIGNSTRIDE) == 0) &&
00217 (! (flags & VL_NO_SIMD)) ;
00218 vl_bool transp = flags & VL_TRANSPOSE ;
00219 vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00220 /* scratch column of src_height + filt_size elements; after the shift below, indices -filt_size .. src_height - 1 are valid */
00221 T * buff = vl_malloc(sizeof(T) * (src_height + filt_size)) ; /* NOTE(review): the result is used without a NULL check */
00222 #define fa (1.0 / (double) (filt_size + 1)) /* NOTE(review): fa is not #undef'd in the code shown */
00223 T scale = fa*fa*fa*fa ;
00224 buff += filt_size ;
00225
00226 while (x < src_width) {
00227 T const *srci ;
00228
00229 use_simd = 0 ; /* forces the else branch below on every column */
00230 if ((x + VSIZE < src_width) &
00231 (((vl_ptrint)(src + x) & ALIGNPTR) == 0) &
00232 use_simd)
00233 {
00234 /* empty: no SIMD implementation is present */
00235 } else {
00236 int stridex = transp ? dst_stride : 1 ;
00237 int stridey = transp ? 1 : dst_stride ;
00238 srci = src + x + src_stride * (src_height - 1) ; /* last row of column x */
00239
00240 /* backward running sum: buff[y] = sum of the column from row y to the last row */
00241 buff [src_height - 1] = *srci ;
00242 for (y = src_height-2 ; y >= 0 ; --y) {
00243 srci -= src_stride ;
00244 buff [y] = buff [y+1] + *srci ;
00245 }
00246 if (zeropad) { /* extend to y = -1 .. -filt_size: keep the sum constant (zero padding) ... */
00247 for ( ; y >= - filt_size ; --y) {
00248 buff [y] = buff [y+1] ;
00249 }
00250 } else { /* ... or keep adding *srci, which is no longer moved */
00251 for ( ; y >= - filt_size ; --y) {
00252 buff [y] = buff[y+1] + *srci ;
00253 }
00254 }
00255
00256 /* difference of running sums filt_size apart */
00257 for (y = - filt_size ; y < src_height - filt_size ; ++y) {
00258 buff [y] = buff [y] - buff [y + filt_size] ;
00259 }
00260 if (! zeropad) { /* extra term for the last filt_size entries when not zero padding */
00261 for (y = src_height - filt_size ; y < src_height ; ++y) {
00262 buff [y] = buff [y] - buff [src_height-1] *
00263 (src_height - filt_size - y) ;
00264 }
00265 }
00266
00267 /* forward running sum over the buffer */
00268 for (y = - filt_size + 1 ; y < src_height ; ++y) {
00269 buff [y] += buff [y - 1] ;
00270 }
00271
00272 /* second difference filt_size apart, scaled and written to dst */
00273 for (y = src_height - 1 ; y >= 0 ; --y) {
00274 dst [x*stridex + y*stridey]
00275 = scale * (buff [y] - buff [y - filt_size]) ;
00276 }
00277 }
00278 x += 1 ;
00279 }
00280 vl_free (buff - filt_size) ; /* undo the earlier shift to free the pointer returned by vl_malloc */
00281 }
00282 #endif
00283
00284 #undef FLT /* clear the per-instantiation macros so the including pass can define them again */
00285 #undef VL_IMOPV_SSE2_INSTANTIATING
00286 #endif /* closes #ifndef VL_IMOPV_SSE2_INSTANTIATING ... #else */
00287
00288
00289 #endif /* closes #if ! defined(VL_DISABLE_SSE2) */