ptam: PatchFinder.cc Source File

Go to the documentation of this file.
00001 // Copyright 2008 Isis Innovation Limited
00002 #include "ptam/PatchFinder.h"
00003 #include "ptam/SmallMatrixOpts.h"
00004 #include "ptam/KeyFrame.h"
00005 
00006 #include <cvd/vision.h>
00007 #include <cvd/vector_image_ref.h>
00008 #include <cvd/image_interpolate.h>
00009 #include <TooN/Cholesky.h>
00010 // tmmintrin.h contains SSE3<> instrinsics, used for the ZMSSD search at the bottom..
00011 // If this causes problems, just do #define CVD_HAVE_XMMINTRIN 0
00012 #if CVD_HAVE_XMMINTRIN
00013 #include <tmmintrin.h>
00014 #endif
00015 
00016 using namespace CVD;
00017 using namespace std;
00018 
00019 PatchFinder::PatchFinder(int nPatchSize)
00020 : mimTemplate(ImageRef(nPatchSize,nPatchSize))
00021 {
00022   mnPatchSize = nPatchSize;
00023   mirCenter = ImageRef(nPatchSize/2, nPatchSize/2);
00024   int nMaxSSDPerPixel = 500; // Pretty arbitrary... could make a GVar out of this.
00025   mnMaxSSD = mnPatchSize * mnPatchSize * nMaxSSDPerPixel;
00026   // Populate the speed-up caches with bogus values:
00027   mm2LastWarpMatrix = 9999.9 * Identity;
00028 };
00029 
00030 
00031 // Find the warping matrix and search level
00032 int PatchFinder::CalcSearchLevelAndWarpMatrix(MapPoint::Ptr p,
00033                                               SE3<> se3CFromW,
00034                                               Matrix<2> &m2CamDerivs)
00035 {
00036   // Calc point pos in new view camera frame
00037   // Slightly dumb that we re-calculate this here when the tracker's already done this!
00038   Vector<3> v3Cam = se3CFromW * p->v3WorldPos;
00039   double dOneOverCameraZ = 1.0 / v3Cam[2];
00040   // Project the source keyframe's one-pixel-right and one-pixel-down vectors into the current view
00041   Vector<3> v3MotionRight = se3CFromW.get_rotation() * p->v3PixelRight_W;
00042   Vector<3> v3MotionDown = se3CFromW.get_rotation() * p->v3PixelDown_W;
00043   // Calculate in-image derivatives of source image pixel motions:
00044   mm2WarpInverse.T()[0] = m2CamDerivs * (v3MotionRight.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionRight[2] * dOneOverCameraZ) * dOneOverCameraZ;
00045   mm2WarpInverse.T()[1] = m2CamDerivs * (v3MotionDown.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionDown[2] * dOneOverCameraZ) * dOneOverCameraZ;
00046   double dDet = mm2WarpInverse[0][0] * mm2WarpInverse[1][1] - mm2WarpInverse[0][1] * mm2WarpInverse[1][0];
00047   mnSearchLevel = 0;
00048 
00049   // This warp matrix is likely not appropriate for finding at level zero, which is 
00050   // the level at which it has been calculated. Vary the search level until the 
00051   // at that level would be appropriate (does not actually modify the matrix.)
00052   while(dDet > 3 && mnSearchLevel < LEVELS-1)
00053   {
00054     mnSearchLevel++;
00055     dDet *= 0.25;
00056   };
00057 
00058   // Some warps are inappropriate, e.g. too near the camera, too far, or reflected, 
00059   // or zero area.. reject these!
00060   if(dDet > 3 || dDet < 0.25)
00061   {
00062     mbTemplateBad = true;
00063     return -1;
00064   }
00065   else
00066     return mnSearchLevel;
00067 }
00068 
00069 // This is just a convenience function wich caluclates the warp matrix and generates
00070 // the template all in one call.
00071 void PatchFinder::MakeTemplateCoarse(MapPoint::Ptr p,
00072                                      SE3<> se3CFromW,
00073                                      Matrix<2> &m2CamDerivs)
00074 {
00075   CalcSearchLevelAndWarpMatrix(p, se3CFromW, m2CamDerivs);
00076   MakeTemplateCoarseCont(p);
00077 };
00078 
00079 // This function generates the warped search template.
00080 void PatchFinder::MakeTemplateCoarseCont(MapPoint::Ptr p)
00081 {
00082   // Get the warping matrix appropriate for use with CVD::transform...
00083   Matrix<2> m2 = M2Inverse(mm2WarpInverse) * LevelScale(mnSearchLevel); 
00084   // m2 now represents the number of pixels in the source image for one 
00085   // pixel of template image
00086 
00087   // Optimisation: Don't re-gen the coarse template if it's going to be substantially the 
00088   // same as was made last time. This saves time when the camera is not moving. For this, 
00089   // check that (a) this patchfinder is still working on the same map point and (b) the 
00090   // warping matrix has not changed much.
00091 
00092   bool bNeedToRefreshTemplate = false;
00093   if(p != mpLastTemplateMapPoint)
00094     bNeedToRefreshTemplate = true;
00095   // Still the same map point? Then compare warping matrix..
00096   for(int i=0; !bNeedToRefreshTemplate && i<2; i++)
00097   {
00098     Vector<2> v2Diff = m2.T()[i] - mm2LastWarpMatrix.T()[i];
00099     const double dRefreshLimit = 0.07;  // Sort of works out as half a pixel displacement in src img
00100     if(v2Diff * v2Diff > dRefreshLimit * dRefreshLimit)
00101       bNeedToRefreshTemplate = true;
00102   }
00103 
00104   // Need to regen template? Then go ahead.
00105   if(bNeedToRefreshTemplate)
00106   {
00107     int nOutside;  // Use CVD::transform to warp the patch according the the warping matrix m2
00108     // This returns the number of pixels outside the source image hit, which should be zero.
00109     nOutside = CVD::transform(p->pPatchSourceKF->aLevels[p->nSourceLevel].im,
00110                               mimTemplate,
00111                               m2,
00112                               vec(p->irCenter),
00113                               vec(mirCenter));
00114 
00115     if(nOutside)
00116       mbTemplateBad = true;
00117     else
00118       mbTemplateBad = false;
00119 
00120     MakeTemplateSums();
00121 
00122     // Store the parameters which allow us to determine if we need to re-calculate
00123     // the patch next time round.
00124     mpLastTemplateMapPoint = p;
00125     mm2LastWarpMatrix = m2;
00126   }
00127 };
00128 
00129 // This makes a template without warping. Used for epipolar search, where we don't really know 
00130 // what the warping matrix should be. (Although to be fair, I should do rotation for epipolar,
00131 // which we could approximate without knowing patch depth!)
00132 void PatchFinder::MakeTemplateCoarseNoWarp(KeyFrame::Ptr k, int nLevel, ImageRef irLevelPos)
00133 {
00134   mnSearchLevel = nLevel;
00135   Image<CVD::byte> &im = k->aLevels[nLevel].im;
00136   if(!im.in_image_with_border(irLevelPos, mnPatchSize / 2 + 1))
00137   {
00138     mbTemplateBad = true;
00139     return;
00140   }
00141   mbTemplateBad = false;
00142   copy(im,
00143        mimTemplate,
00144        mimTemplate.size(),
00145        irLevelPos - mirCenter);
00146 
00147   MakeTemplateSums();
00148 }
00149 
00150 // Convenient wrapper for the above
00151 void PatchFinder::MakeTemplateCoarseNoWarp(MapPoint::Ptr p)
00152 {
00153   MakeTemplateCoarseNoWarp(p->pPatchSourceKF, p->nSourceLevel,  p->irCenter);
00154 };
00155 
00156 // Finds the sum, and sum-squared, of template pixels. These sums are used
00157 // to calculate the ZMSSD.
00158 inline void PatchFinder::MakeTemplateSums()
00159 {
00160   int nSum = 0;
00161   int nSumSq = 0;
00162   ImageRef ir;
00163   do
00164   {
00165     int b = mimTemplate[ir];
00166     nSum += b;
00167     nSumSq +=b * b;
00168   }
00169   while(ir.next(mimTemplate.size()));
00170   mnTemplateSum = nSum;
00171   mnTemplateSumSq = nSumSq;
00172 }
00173 
00174 // One of the main functions of the class! Looks at the appropriate level of 
00175 // the target keyframe to try and find the template. Looks only at FAST corner points
00176 // which are within radius nRange of the center. (Params are supplied in Level0
00177 // coords.) Returns true on patch found.
00178 bool PatchFinder::FindPatchCoarse(ImageRef irPos, KeyFrame::Ptr kf, unsigned int nRange)
00179 {
00180   mbFound = false;
00181 
00182   // Convert from L0 coords to search level quantities
00183   int nLevelScale = LevelScale(mnSearchLevel);
00184   mirPredictedPos = irPos;
00185   irPos = irPos / nLevelScale;
00186   nRange = (nRange + nLevelScale - 1) / nLevelScale;
00187 
00188   // Bounding box of search circle
00189   int nTop = irPos.y - nRange;
00190   int nBottomPlusOne = irPos.y + nRange + 1;
00191   int nLeft = irPos.x - nRange;
00192   int nRight = irPos.x + nRange;
00193 
00194   // Ref variable for the search level
00195   Level &L = kf->aLevels[mnSearchLevel];
00196 
00197   // Some bounds checks on the bounding box..
00198   if(nTop < 0)
00199     nTop = 0;
00200   if(nTop >= L.im.size().y)
00201     return false;
00202   if(nBottomPlusOne <= 0)
00203     return false;
00204 
00205   // The next section finds all the FAST corners in the target level which 
00206   // are near enough the search center. It's a bit optimised to use 
00207   // a corner row look-up-table, since otherwise the routine
00208   // would spend a long time trawling throught the whole list of FAST corners!
00209   vector<ImageRef>::iterator i;
00210   vector<ImageRef>::iterator i_end;
00211 
00212   i = L.vCorners.begin() + L.vCornerRowLUT[nTop];
00213 
00214   if(nBottomPlusOne >= L.im.size().y)
00215     i_end = L.vCorners.end();
00216   else 
00217     i_end = L.vCorners.begin() + L.vCornerRowLUT[nBottomPlusOne];
00218 
00219   ImageRef irBest;             // Best match so far
00220   int nBestSSD = mnMaxSSD + 1; // Best score so far is beyond the max allowed
00221 
00222   for(; i<i_end; i++)          // For each corner ...
00223   {
00224     if( i->x < nLeft || i->x > nRight)
00225       continue;
00226     if((irPos - *i).mag_squared() > nRange * nRange)
00227       continue;              // ... reject all those not close enough..
00228 
00229     int nSSD;                // .. and find the ZMSSD at those near enough.
00230     nSSD = ZMSSDAtPoint(L.im, *i);
00231     if(nSSD < nBestSSD)      // Best yet?
00232     {
00233       irBest = *i;
00234       nBestSSD = nSSD;
00235     }
00236   } // done looping over corners
00237 
00238   if(nBestSSD < mnMaxSSD)      // Found a valid match?
00239   {
00240     mv2CoarsePos= LevelZeroPos(irBest, mnSearchLevel);
00241     mbFound = true;
00242   }
00243   else
00244     mbFound = false;
00245   return mbFound;
00246 }
00247 
00248 // Makes an inverse composition template out of the coarse template.
00249 // Includes calculating image of derivatives (gradients.) The inverse composition
00250 // used here operates on three variables: x offet, y offset, and difference in patch
00251 // means; hence things like mm3HInv are dim 3, but the trivial mean jacobian 
00252 // (always unity, for each pixel) is not stored.
00253 void PatchFinder::MakeSubPixTemplate()
00254 {
00255   mimJacs.resize(mimTemplate.size() - ImageRef(2,2));
00256   Matrix<3> m3H = Zeros; // This stores jTj.
00257   ImageRef ir;
00258   for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
00259     for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
00260     {
00261       Vector<2> v2Grad;
00262       v2Grad[0] = 0.5 * (mimTemplate[ir + ImageRef(1,0)] - mimTemplate[ir - ImageRef(1,0)]);
00263       v2Grad[1] = 0.5 * (mimTemplate[ir + ImageRef(0,1)] - mimTemplate[ir - ImageRef(0,1)]);
00264       mimJacs[ir-ImageRef(1,1)].first = v2Grad[0];
00265       mimJacs[ir-ImageRef(1,1)].second = v2Grad[1];
00266       Vector<3> v3Grad = unproject(v2Grad); // This adds the mean-difference jacobian..
00267       m3H += v3Grad.as_col() * v3Grad.as_row(); // Populate JTJ.
00268     }
00269 
00270   // Invert JTJ..
00271   Cholesky<3> chol(m3H);
00272   mm3HInv = chol.get_inverse();
00273   // TOON2 Does not have a get_rank for cholesky
00274   // int nRank = chol.get_rank();
00275   // if(nRank < 3)
00276   // cout << "BAD RANK IN MAKESUBPIXELTEMPLATE!!!!" << endl; // This does not happen often (almost never!)
00277 
00278   mv2SubPixPos = mv2CoarsePos; // Start the sub-pixel search at the result of the coarse search..
00279   mdMeanDiff = 0.0;
00280 }
00281 
00282 // Iterate inverse composition until convergence. Since it should never have 
00283 // to travel more than a pixel's distance, set a max number of iterations; 
00284 // if this is exceeded, consider the IC to have failed.
00285 bool PatchFinder::IterateSubPixToConvergence(KeyFrame& kf, int nMaxIts)
00286 {
00287   const double dConvLimit = 0.03;
00288   bool bConverged = false;
00289   int nIts;
00290   for(nIts = 0; nIts < nMaxIts && !bConverged; nIts++)
00291   {
00292     double dUpdateSquared = IterateSubPix(kf);
00293     if(dUpdateSquared < 0) // went off edge of image
00294       return false;
00295     if(dUpdateSquared < dConvLimit*dConvLimit)
00296       return true;
00297   }
00298   return false;
00299 }
00300 
00301 // Single iteration of inverse composition. This compares integral image positions in the 
00302 // template image to floating point positions in the target keyframe. Interpolation is
00303 // bilinear, and performed manually (rather than using CVD::image_interpolate) since 
00304 // this is a special case where the mixing fractions for each pixel are identical.
00305 double PatchFinder::IterateSubPix(KeyFrame& kf)
00306 {
00307   // Search level pos of patch center
00308   Vector<2> v2Center = LevelNPos(mv2SubPixPos, mnSearchLevel);
00309   BasicImage<CVD::byte> &im = kf.aLevels[mnSearchLevel].im;
00310   if(!im.in_image_with_border(ir_rounded(v2Center), mnPatchSize / 2 + 1))
00311     return -1.0;       // Negative return value indicates off edge of image 
00312 
00313   // Position of top-left corner of patch in search level
00314   Vector<2> v2Base = v2Center - vec(mirCenter);
00315 
00316   // I.C. JT*d accumulator
00317   Vector<3> v3Accum = Zeros;
00318 
00319   ImageRef ir;
00320 
00321   CVD::byte* pTopLeftPixel;
00322 
00323   // Each template pixel will be compared to an interpolated target pixel
00324   // The target value is made using bilinear interpolation as the weighted sum
00325   // of four target image pixels. Calculate mixing fractions:
00326   double dX = v2Base[0]-floor(v2Base[0]); // Distances from pixel center of TL pixel
00327   double dY = v2Base[1]-floor(v2Base[1]);
00328   float fMixTL = (1.0 - dX) * (1.0 - dY);
00329   float fMixTR = (dX)       * (1.0 - dY);
00330   float fMixBL = (1.0 - dX) * (dY);
00331   float fMixBR = (dX)       * (dY);
00332 
00333   // Loop over template image
00334   unsigned long nRowOffset = &kf.aLevels[mnSearchLevel].im[ImageRef(0,1)] - &kf.aLevels[mnSearchLevel].im[ImageRef(0,0)];
00335   for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
00336   {
00337     pTopLeftPixel = &im[::ir(v2Base) + ImageRef(1,ir.y)]; // n.b. the x=1 offset, as with y
00338     for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
00339     {
00340       float fPixel =   // Calc target interpolated pixel
00341           fMixTL * pTopLeftPixel[0]          + fMixTR * pTopLeftPixel[1] +
00342           fMixBL * pTopLeftPixel[nRowOffset] + fMixBR * pTopLeftPixel[nRowOffset + 1];
00343       pTopLeftPixel++;
00344       double dDiff = fPixel - mimTemplate[ir] + mdMeanDiff;
00345       v3Accum[0] += dDiff * mimJacs[ir - ImageRef(1,1)].first;
00346       v3Accum[1] += dDiff * mimJacs[ir - ImageRef(1,1)].second;
00347       v3Accum[2] += dDiff;  // Update JT*d
00348     };
00349   }
00350 
00351   // All done looping over image - find JTJ^-1 * JTd:
00352   Vector<3> v3Update = mm3HInv * v3Accum; 
00353   mv2SubPixPos -= v3Update.slice<0,2>() * LevelScale(mnSearchLevel);
00354   mdMeanDiff -= v3Update[2];
00355 
00356   double dPixelUpdateSquared = v3Update.slice<0,2>() * v3Update.slice<0,2>();
00357   return dPixelUpdateSquared;
00358 }
00359 
00362 // 
00363 //
00364 //              ZMSSDatpoint, which is SSE optimised, follows
00365 //
00366 // The top version is the SSE version for 8x8 patches. It is compiled
00367 // only if CVD_HAVE_XMMINTRIN is true, also you need to give your 
00368 // compiler the appropriate flags (e.g. -march=core2 -msse3 for g++.)
00369 // The standard c++ version, which is about half as quick (not a disaster
00370 // by any means) is below.
00371 //
00372 // The 8x8 SSE version looks long because it has been unrolled, 
00373 // it just does the same thing eight times. Both versions are one-pass
00374 // and need pre-calculated template sums and sum-squares.
00375 //
00378 
00379 #if CVD_HAVE_XMMINTRIN
00380 // Horizontal sum of uint16s stored in an XMM register
00381 inline int SumXMM_16(__m128i &target)
00382 {
00383   unsigned short int sums_store[8];    
00384   _mm_storeu_si128((__m128i*)sums_store, target);
00385   return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3] +
00386       sums_store[4] + sums_store[5] + sums_store[6] + sums_store[7];
00387 }
00388 // Horizontal sum of uint32s stored in an XMM register
00389 inline int SumXMM_32(__m128i &target)
00390 {
00391   unsigned int sums_store[4];    
00392   _mm_storeu_si128((__m128i*)sums_store, target);
00393   return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
00394 }
00395 #endif
00396 
00397 // Calculate the Zero-mean SSD of the coarse patch and a target imate at a specific 
00398 // point.
00399 int PatchFinder::ZMSSDAtPoint(CVD::BasicImage<CVD::byte> &im, const CVD::ImageRef &ir)
00400 {
00401   if(!im.in_image_with_border(ir, mirCenter[0]))
00402     return mnMaxSSD + 1;
00403 
00404   ImageRef irImgBase = ir - mirCenter;
00405   CVD::byte *imagepointer;
00406   CVD::byte *templatepointer;
00407 
00408   int nImageSumSq = 0;
00409   int nImageSum = 0;
00410   int nCrossSum = 0;
00411 
00412 #if CVD_HAVE_XMMINTRIN
00413   if(mnPatchSize == 8)
00414   {
00415     long unsigned int imagepointerincrement;
00416 
00417     __m128i xImageAsEightBytes;
00418     __m128i xImageAsWords;
00419     __m128i xTemplateAsEightBytes;
00420     __m128i xTemplateAsWords;
00421     __m128i xZero;
00422     __m128i xImageSums; // These sums are 8xuint16
00423     __m128i xImageSqSums; // These sums are 4xint32
00424     __m128i xCrossSums;   // These sums are 4xint32
00425     __m128i xProduct;
00426 
00427 
00428     xImageSums = _mm_setzero_si128();
00429     xImageSqSums = _mm_setzero_si128();
00430     xCrossSums = _mm_setzero_si128();
00431     xZero = _mm_setzero_si128();
00432 
00433     imagepointer = &im[irImgBase + ImageRef(0,0)];
00434     templatepointer = &mimTemplate[ImageRef(0,0)];
00435     imagepointerincrement = &im[irImgBase + ImageRef(0,1)] - imagepointer;
00436 
00437     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00438     imagepointer += imagepointerincrement;
00439     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00440     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00441     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00442     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00443     xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00444     templatepointer += 16;
00445     xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00446     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00447     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00448     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00449     imagepointer += imagepointerincrement;
00450     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00451     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00452     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00453     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00454     xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00455     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00456     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00457 
00458     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00459     imagepointer += imagepointerincrement;
00460     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00461     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00462     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00463     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00464     xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00465     templatepointer += 16;
00466     xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00467     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00468     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00469     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00470     imagepointer += imagepointerincrement;
00471     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00472     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00473     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00474     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00475     xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00476     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00477     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00478 
00479     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00480     imagepointer += imagepointerincrement;
00481     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00482     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00483     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00484     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00485     xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00486     templatepointer += 16;
00487     xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00488     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00489     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00490     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00491     imagepointer += imagepointerincrement;
00492     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00493     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00494     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00495     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00496     xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00497     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00498     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00499 
00500     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00501     imagepointer += imagepointerincrement;
00502     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00503     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00504     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00505     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00506     xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
00507     templatepointer += 16;
00508     xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
00509     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00510     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00511     xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
00512     xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
00513     xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
00514     xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
00515     xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
00516     xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
00517     xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
00518     xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
00519 
00520     nImageSum = SumXMM_16(xImageSums);
00521     nCrossSum = SumXMM_32(xCrossSums);
00522     nImageSumSq = SumXMM_32(xImageSqSums);
00523   }
00524   else
00525 #endif 
00526   {
00527     for(int nRow = 0; nRow < mnPatchSize; nRow++)
00528     {
00529       imagepointer = &im[irImgBase + ImageRef(0,nRow)];
00530       templatepointer = &mimTemplate[ImageRef(0,nRow)];
00531       for(int nCol = 0; nCol < mnPatchSize; nCol++)
00532       {
00533         int n = imagepointer[nCol];
00534         nImageSum += n;
00535         nImageSumSq += n*n;
00536         nCrossSum += n * templatepointer[nCol];
00537       };
00538     }
00539   };
00540 
00541   int SA = mnTemplateSum;
00542   int SB = nImageSum;
00543 
00544   int N = mnPatchSize * mnPatchSize;
00545   return ((2*SA*SB - SA*SA - SB*SB)/N + nImageSumSq + mnTemplateSumSq - 2*nCrossSum);
00546 }
00547 
00548 
00549 
00550 
00551 
00552