tum_ardrone: PatchFinder.cc Source File

Go to the documentation of this file.
00001 // Copyright 2008 Isis Innovation Limited
00002 #include "PatchFinder.h"
00003 #include "SmallMatrixOpts.h"
00004 #include "KeyFrame.h"
00005 
00006 #include <cvd/vision.h>
00007 #include <cvd/vector_image_ref.h>
00008 #include <cvd/image_interpolate.h>
00009 #include <TooN/Cholesky.h>
00010 // tmmintrin.h contains SSE3<> instrinsics, used for the ZMSSD search at the bottom..
00011 // If this causes problems, just do #define CVD_HAVE_XMMINTRIN 0
00012 #if CVD_HAVE_XMMINTRIN
00013 #include <tmmintrin.h>
00014 #endif
00015 
00016 using namespace CVD;
00017 using namespace std;
00018 
00019 PatchFinder::PatchFinder(int nPatchSize)
00020   : mimTemplate(ImageRef(nPatchSize,nPatchSize))
00021 {
00022   mnPatchSize = nPatchSize;
00023   mirCenter = ImageRef(nPatchSize/2, nPatchSize/2);
00024   int nMaxSSDPerPixel = 500; // Pretty arbitrary... could make a GVar out of this.
00025   mnMaxSSD = mnPatchSize * mnPatchSize * nMaxSSDPerPixel;
00026   // Populate the speed-up caches with bogus values:
00027   mm2LastWarpMatrix = 9999.9 * Identity;
00028   mpLastTemplateMapPoint = NULL;
00029 };
00030 
00031 
00032 // Find the warping matrix and search level
00033 int PatchFinder::CalcSearchLevelAndWarpMatrix(MapPoint &p,
00034                                               SE3<> se3CFromW,
00035                                               Matrix<2> &m2CamDerivs)
00036 {
00037   // Calc point pos in new view camera frame
00038   // Slightly dumb that we re-calculate this here when the tracker's already done this!
00039   Vector<3> v3Cam = se3CFromW * p.v3WorldPos;
00040   double dOneOverCameraZ = 1.0 / v3Cam[2];
00041   // Project the source keyframe's one-pixel-right and one-pixel-down vectors into the current view
00042   Vector<3> v3MotionRight = se3CFromW.get_rotation() * p.v3PixelRight_W;
00043   Vector<3> v3MotionDown = se3CFromW.get_rotation() * p.v3PixelDown_W;
00044   // Calculate in-image derivatives of source image pixel motions:
00045   mm2WarpInverse.T()[0] = m2CamDerivs * (v3MotionRight.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionRight[2] * dOneOverCameraZ) * dOneOverCameraZ;
00046   mm2WarpInverse.T()[1] = m2CamDerivs * (v3MotionDown.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionDown[2] * dOneOverCameraZ) * dOneOverCameraZ;
00047   double dDet = mm2WarpInverse[0][0] * mm2WarpInverse[1][1] - mm2WarpInverse[0][1] * mm2WarpInverse[1][0];
00048   mnSearchLevel = 0;
00049   
00050   // This warp matrix is likely not appropriate for finding at level zero, which is 
00051   // the level at which it has been calculated. Vary the search level until the 
00052   // at that level would be appropriate (does not actually modify the matrix.)
00053   while(dDet > 3 && mnSearchLevel < LEVELS-1)
00054     {
00055       mnSearchLevel++;
00056       dDet *= 0.25;
00057     };
00058   
00059   // Some warps are inappropriate, e.g. too near the camera, too far, or reflected, 
00060   // or zero area.. reject these!
00061   if(dDet > 3 || dDet < 0.25)
00062     {
00063       mbTemplateBad = true;
00064       return -1;
00065     }
00066   else
00067     return mnSearchLevel;
00068 }
00069 
00070 // This is just a convenience function wich caluclates the warp matrix and generates
00071 // the template all in one call.
00072 void PatchFinder::MakeTemplateCoarse(MapPoint &p,
00073                                      SE3<> se3CFromW,
00074                                      Matrix<2> &m2CamDerivs)
00075 {
00076   CalcSearchLevelAndWarpMatrix(p, se3CFromW, m2CamDerivs);
00077   MakeTemplateCoarseCont(p);
00078 };
00079 
00080 // This function generates the warped search template.
00081 void PatchFinder::MakeTemplateCoarseCont(MapPoint &p)
00082 {
00083   // Get the warping matrix appropriate for use with CVD::transform...
00084   Matrix<2> m2 = M2Inverse(mm2WarpInverse) * LevelScale(mnSearchLevel); 
00085   // m2 now represents the number of pixels in the source image for one 
00086   // pixel of template image
00087   
00088   // Optimisation: Don't re-gen the coarse template if it's going to be substantially the 
00089   // same as was made last time. This saves time when the camera is not moving. For this, 
00090   // check that (a) this patchfinder is still working on the same map point and (b) the 
00091   // warping matrix has not changed much.
00092   
00093   bool bNeedToRefreshTemplate = false;
00094   if(&p != mpLastTemplateMapPoint)
00095     bNeedToRefreshTemplate = true;
00096   // Still the same map point? Then compare warping matrix..
00097   for(int i=0; !bNeedToRefreshTemplate && i<2; i++)
00098     {
00099       Vector<2> v2Diff = m2.T()[i] - mm2LastWarpMatrix.T()[i];
00100       const double dRefreshLimit = 0.07;  // Sort of works out as half a pixel displacement in src img
00101       if(v2Diff * v2Diff > dRefreshLimit * dRefreshLimit)
00102         bNeedToRefreshTemplate = true;
00103     }
00104   
00105   // Need to regen template? Then go ahead.
00106   if(bNeedToRefreshTemplate)
00107     {
00108       int nOutside;  // Use CVD::transform to warp the patch according the the warping matrix m2
00109                      // This returns the number of pixels outside the source image hit, which should be zero.
00110       nOutside = CVD::transform(p.pPatchSourceKF->aLevels[p.nSourceLevel].im, 
00111                                 mimTemplate, 
00112                                 m2,
00113                                 vec(p.irCenter),
00114                                 vec(mirCenter)); 
00115       
00116       if(nOutside)
00117         mbTemplateBad = true;
00118       else
00119         mbTemplateBad = false;
00120       
00121       MakeTemplateSums();
00122       
00123       // Store the parameters which allow us to determine if we need to re-calculate
00124       // the patch next time round.
00125       mpLastTemplateMapPoint = &p;
00126       mm2LastWarpMatrix = m2;
00127     }
00128 };
00129 
00130 // This makes a template without warping. Used for epipolar search, where we don't really know 
00131 // what the warping matrix should be. (Although to be fair, I should do rotation for epipolar,
00132 // which we could approximate without knowing patch depth!)
00133 void PatchFinder::MakeTemplateCoarseNoWarp(KeyFrame &k, int nLevel, ImageRef irLevelPos)
00134 {
00135   mnSearchLevel = nLevel;
00136   Image<byte> &im = k.aLevels[nLevel].im;
00137   if(!im.in_image_with_border(irLevelPos, mnPatchSize / 2 + 1))
00138     {
00139       mbTemplateBad = true;
00140       return;
00141     }
00142   mbTemplateBad = false;
00143   copy(im,
00144        mimTemplate,
00145        mimTemplate.size(),
00146        irLevelPos - mirCenter);
00147   
00148   MakeTemplateSums();
00149 }
00150 
00151 // Convenient wrapper for the above
00152 void PatchFinder::MakeTemplateCoarseNoWarp(MapPoint &p)
00153 {
00154   MakeTemplateCoarseNoWarp(*p.pPatchSourceKF, p.nSourceLevel,  p.irCenter);
00155 };
00156 
00157 // Finds the sum, and sum-squared, of template pixels. These sums are used
00158 // to calculate the ZMSSD.
00159 inline void PatchFinder::MakeTemplateSums()
00160 {
00161   int nSum = 0;
00162   int nSumSq = 0;
00163   ImageRef ir;
00164   do
00165     {
00166       int b = mimTemplate[ir];
00167       nSum += b;
00168       nSumSq +=b * b;
00169     }      
00170   while(ir.next(mimTemplate.size()));
00171   mnTemplateSum = nSum;
00172   mnTemplateSumSq = nSumSq;
00173 }
00174 
00175 // One of the main functions of the class! Looks at the appropriate level of 
00176 // the target keyframe to try and find the template. Looks only at FAST corner points
00177 // which are within radius nRange of the center. (Params are supplied in Level0
00178 // coords.) Returns true on patch found.
00179 bool PatchFinder::FindPatchCoarse(ImageRef irPos, KeyFrame &kf, unsigned int nRange)
00180 {
00181   mbFound = false;
00182   
00183   // Convert from L0 coords to search level quantities
00184   int nLevelScale = LevelScale(mnSearchLevel);
00185   mirPredictedPos = irPos;
00186   irPos = irPos / nLevelScale;
00187   nRange = (nRange + nLevelScale - 1) / nLevelScale;
00188   
00189   // Bounding box of search circle
00190   int nTop = irPos.y - nRange;
00191   int nBottomPlusOne = irPos.y + nRange + 1;
00192   int nLeft = irPos.x - nRange;
00193   int nRight = irPos.x + nRange;
00194   
00195   // Ref variable for the search level
00196   Level &L = kf.aLevels[mnSearchLevel];
00197   
00198   // Some bounds checks on the bounding box..
00199   if(nTop < 0)
00200     nTop = 0;
00201   if(nTop >= L.im.size().y)
00202     return false;
00203   if(nBottomPlusOne <= 0)
00204     return false;
00205   
00206   // The next section finds all the FAST corners in the target level which 
00207   // are near enough the search center. It's a bit optimised to use 
00208   // a corner row look-up-table, since otherwise the routine
00209   // would spend a long time trawling throught the whole list of FAST corners!
00210   vector<ImageRef>::iterator i;
00211   vector<ImageRef>::iterator i_end;
00212   
00213   i = L.vCorners.begin() + L.vCornerRowLUT[nTop];
00214   
00215   if(nBottomPlusOne >= L.im.size().y)
00216     i_end = L.vCorners.end();
00217   else 
00218     i_end = L.vCorners.begin() + L.vCornerRowLUT[nBottomPlusOne];
00219   
00220   ImageRef irBest;             // Best match so far
00221   int nBestSSD = mnMaxSSD + 1; // Best score so far is beyond the max allowed
00222   
00223   for(; i<i_end; i++)          // For each corner ...
00224     {                         
00225       if( i->x < nLeft || i->x > nRight)
00226         continue;
00227       if((irPos - *i).mag_squared() > nRange * nRange)
00228         continue;              // ... reject all those not close enough..
00229 
00230       int nSSD;                // .. and find the ZMSSD at those near enough.
00231       nSSD = ZMSSDAtPoint(L.im, *i);
00232       if(nSSD < nBestSSD)      // Best yet?
00233         {
00234           irBest = *i;
00235           nBestSSD = nSSD;
00236         }
00237     } // done looping over corners
00238   
00239   if(nBestSSD < mnMaxSSD)      // Found a valid match?
00240     {
00241       mv2CoarsePos= LevelZeroPos(irBest, mnSearchLevel);
00242       mbFound = true;
00243     }
00244   else
00245     mbFound = false;
00246   return mbFound;
00247 }
00248 
00249 // Makes an inverse composition template out of the coarse template.
00250 // Includes calculating image of derivatives (gradients.) The inverse composition
00251 // used here operates on three variables: x offet, y offset, and difference in patch
00252 // means; hence things like mm3HInv are dim 3, but the trivial mean jacobian 
00253 // (always unity, for each pixel) is not stored.
00254 void PatchFinder::MakeSubPixTemplate()
00255 {
00256   mimJacs.resize(mimTemplate.size() - ImageRef(2,2));
00257   Matrix<3> m3H = Zeros; // This stores jTj.
00258   ImageRef ir;
00259   for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
00260     for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
00261       {
00262         Vector<2> v2Grad;
00263         v2Grad[0] = 0.5 * (mimTemplate[ir + ImageRef(1,0)] - mimTemplate[ir - ImageRef(1,0)]);
00264         v2Grad[1] = 0.5 * (mimTemplate[ir + ImageRef(0,1)] - mimTemplate[ir - ImageRef(0,1)]);
00265         mimJacs[ir-ImageRef(1,1)].first = v2Grad[0];
00266         mimJacs[ir-ImageRef(1,1)].second = v2Grad[1];
00267         Vector<3> v3Grad = unproject(v2Grad); // This adds the mean-difference jacobian..
00268         m3H += v3Grad.as_col() * v3Grad.as_row(); // Populate JTJ.
00269       }
00270   
00271   // Invert JTJ..
00272   Cholesky<3> chol(m3H);
00273   mm3HInv = chol.get_inverse();
00274   // TOON2 Does not have a get_rank for cholesky
00275   // int nRank = chol.get_rank();
00276   // if(nRank < 3)
00277   // cout << "BAD RANK IN MAKESUBPIXELTEMPLATE!!!!" << endl; // This does not happen often (almost never!)
00278   
00279   mv2SubPixPos = mv2CoarsePos; // Start the sub-pixel search at the result of the coarse search..
00280   mdMeanDiff = 0.0;
00281 }
00282 
00283 // Iterate inverse composition until convergence. Since it should never have 
00284 // to travel more than a pixel's distance, set a max number of iterations; 
00285 // if this is exceeded, consider the IC to have failed.
00286 bool PatchFinder::IterateSubPixToConvergence(KeyFrame &kf, int nMaxIts)
00287 {
00288   const double dConvLimit = 0.03;
00289   bool bConverged = false;
00290   int nIts;
00291   for(nIts = 0; nIts < nMaxIts && !bConverged; nIts++)
00292     {
00293       double dUpdateSquared = IterateSubPix(kf);
00294       if(dUpdateSquared < 0) // went off edge of image
00295         return false;
00296       if(dUpdateSquared < dConvLimit*dConvLimit)
00297         return true;
00298     }
00299   return false;
00300 }
00301 
00302 // Single iteration of inverse composition. This compares integral image positions in the 
00303 // template image to floating point positions in the target keyframe. Interpolation is
00304 // bilinear, and performed manually (rather than using CVD::image_interpolate) since 
00305 // this is a special case where the mixing fractions for each pixel are identical.
00306 double PatchFinder::IterateSubPix(KeyFrame &kf)
00307 {
00308   // Search level pos of patch center
00309   Vector<2> v2Center = LevelNPos(mv2SubPixPos, mnSearchLevel);
00310   BasicImage<byte> &im = kf.aLevels[mnSearchLevel].im;
00311   if(!im.in_image_with_border(ir_rounded(v2Center), mnPatchSize / 2 + 1))
00312     return -1.0;       // Negative return value indicates off edge of image 
00313   
00314   // Position of top-left corner of patch in search level
00315   Vector<2> v2Base = v2Center - vec(mirCenter);
00316   
00317   // I.C. JT*d accumulator
00318   Vector<3> v3Accum = Zeros;
00319   
00320   ImageRef ir;
00321   
00322   byte* pTopLeftPixel;
00323   
00324   // Each template pixel will be compared to an interpolated target pixel
00325   // The target value is made using bilinear interpolation as the weighted sum
00326   // of four target image pixels. Calculate mixing fractions:
00327   double dX = v2Base[0]-floor(v2Base[0]); // Distances from pixel center of TL pixel
00328   double dY = v2Base[1]-floor(v2Base[1]);
00329   float fMixTL = (1.0 - dX) * (1.0 - dY);
00330   float fMixTR = (dX)       * (1.0 - dY);
00331   float fMixBL = (1.0 - dX) * (dY);
00332   float fMixBR = (dX)       * (dY);
00333   
00334   // Loop over template image
00335   unsigned long nRowOffset = &kf.aLevels[mnSearchLevel].im[ImageRef(0,1)] - &kf.aLevels[mnSearchLevel].im[ImageRef(0,0)];
00336   for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
00337     {
00338       pTopLeftPixel = &im[::ir(v2Base) + ImageRef(1,ir.y)]; // n.b. the x=1 offset, as with y
00339       for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
00340         {
00341           float fPixel =   // Calc target interpolated pixel
00342             fMixTL * pTopLeftPixel[0]          + fMixTR * pTopLeftPixel[1] + 
00343             fMixBL * pTopLeftPixel[nRowOffset] + fMixBR * pTopLeftPixel[nRowOffset + 1];
00344           pTopLeftPixel++;
00345           double dDiff = fPixel - mimTemplate[ir] + mdMeanDiff;
00346           v3Accum[0] += dDiff * mimJacs[ir - ImageRef(1,1)].first;
00347           v3Accum[1] += dDiff * mimJacs[ir - ImageRef(1,1)].second;
00348           v3Accum[2] += dDiff;  // Update JT*d
00349         };
00350     }
00351   
00352   // All done looping over image - find JTJ^-1 * JTd:
00353   Vector<3> v3Update = mm3HInv * v3Accum; 
00354   mv2SubPixPos -= v3Update.slice<0,2>() * LevelScale(mnSearchLevel);
00355   mdMeanDiff -= v3Update[2];
00356   
00357   double dPixelUpdateSquared = v3Update.slice<0,2>() * v3Update.slice<0,2>();
00358   return dPixelUpdateSquared;
00359 }
00360 
00363 // 
00364 //
00365 //              ZMSSDatpoint, which is SSE optimised, follows
00366 //
00367 // The top version is the SSE version for 8x8 patches. It is compiled
00368 // only if CVD_HAVE_XMMINTRIN is true, also you need to give your 
00369 // compiler the appropriate flags (e.g. -march=core2 -msse3 for g++.)
00370 // The standard c++ version, which is about half as quick (not a disaster
00371 // by any means) is below.
00372 //
00373 // The 8x8 SSE version looks long because it has been unrolled, 
00374 // it just does the same thing eight times. Both versions are one-pass
00375 // and need pre-calculated template sums and sum-squares.
00376 //
00379 
00380 #if CVD_HAVE_XMMINTRIN
00381 // Horizontal sum of uint16s stored in an XMM register
00382 inline int SumXMM_16(__m128i &target)
00383 {
00384   unsigned short int sums_store[8];    
00385   _mm_storeu_si128((__m128i*)sums_store, target);
00386   return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3] +
00387     sums_store[4] + sums_store[5] + sums_store[6] + sums_store[7];
00388 }
00389 // Horizontal sum of uint32s stored in an XMM register
00390 inline int SumXMM_32(__m128i &target)
00391 {
00392   unsigned int sums_store[4];    
00393   _mm_storeu_si128((__m128i*)sums_store, target);
00394   return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
00395 }
00396 #endif
00397 
00398 // Calculate the Zero-mean SSD of the coarse patch and a target imate at a specific 
00399 // point.
00400 int PatchFinder::ZMSSDAtPoint(CVD::BasicImage<CVD::byte> &im, const CVD::ImageRef &ir)
00401 {
00402   if(!im.in_image_with_border(ir, mirCenter[0]))
00403     return mnMaxSSD + 1;
00404   
00405   ImageRef irImgBase = ir - mirCenter;
00406   byte *imagepointer;
00407   byte *templatepointer;
00408   
00409   int nImageSumSq = 0;
00410   int nImageSum = 0;
00411   int nCrossSum = 0;
00412 
00413 #if CVD_HAVE_XMMINTRIN
00414   if(mnPatchSize == 8)
00415     {
00416       long unsigned int imagepointerincrement;
00417 
00418       __m128i xImageAsEightBytes;
00419       __m128i xImageAsWords;
00420       __m128i xTemplateAsEightBytes;
00421       __m128i xTemplateAsWords;
00422       __m128i xZero;
00423       __m128i xImageSums; // These sums are 8xuint16
00424       __m128i xImageSqSums; // These sums are 4xint32
00425       __m128i xCrossSums;   // These sums are 4xint32
00426       __m128i xProduct;
00427 
00428       
00429       xImageSums = _mm_setzero_si128();
00430       xImageSqSums = _mm_setzero_si128();
00431       xCrossSums = _mm_setzero_si128();
00432       xZero = _mm_setzero_si128();
00433       
00434       imagepointer = &im[irImgBase + ImageRef(0,0)];
00435       templatepointer = &mimTemplate[ImageRef(0,0)];
00436       imagepointerincrement = &im[irImgBase + ImageRef(0,1)] - imagepointer;
00437       
00438       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00439       imagepointer += imagepointerincrement;
00440       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00441       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00442       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00443       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00444       xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00445       templatepointer += 16;
00446       xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00447       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00448       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00449       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00450       imagepointer += imagepointerincrement;
00451       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00452       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00453       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00454       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00455       xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00456       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00457       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00458 
00459       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00460       imagepointer += imagepointerincrement;
00461       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00462       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00463       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00464       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00465       xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00466       templatepointer += 16;
00467       xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00468       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00469       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00470       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00471       imagepointer += imagepointerincrement;
00472       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00473       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00474       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00475       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00476       xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00477       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00478       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00479 
00480       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00481       imagepointer += imagepointerincrement;
00482       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00483       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00484       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00485       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00486       xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00487       templatepointer += 16;
00488       xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00489       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00490       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00491       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00492       imagepointer += imagepointerincrement;
00493       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00494       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00495       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00496       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00497       xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00498       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00499       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00500 
00501       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00502       imagepointer += imagepointerincrement;
00503       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00504       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00505       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00506       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00507       xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00508       templatepointer += 16;
00509       xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00510       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00511       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00512       xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00513       xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00514       xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00515       xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00516       xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00517       xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00518       xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00519       xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00520 
00521       nImageSum = SumXMM_16(xImageSums);
00522       nCrossSum = SumXMM_32(xCrossSums);
00523       nImageSumSq = SumXMM_32(xImageSqSums);
00524     }
00525   else
00526 #endif 
00527     {    
00528       for(int nRow = 0; nRow < mnPatchSize; nRow++)
00529         {
00530           imagepointer = &im[irImgBase + ImageRef(0,nRow)];
00531           templatepointer = &mimTemplate[ImageRef(0,nRow)];
00532           for(int nCol = 0; nCol < mnPatchSize; nCol++)
00533             {
00534               int n = imagepointer[nCol];
00535               nImageSum += n;
00536               nImageSumSq += n*n;
00537               nCrossSum += n * templatepointer[nCol];
00538             };
00539         }
00540     };
00541   
00542   int SA = mnTemplateSum;
00543   int SB = nImageSum;
00544   
00545   int N = mnPatchSize * mnPatchSize;
00546   return ((2*SA*SB - SA*SA - SB*SB)/N + nImageSumSq + mnTemplateSumSq - 2*nCrossSum);
00547 }
00548 
00549 
00550 
00551 
00552 
00553