frame_common: stereolib.c Source File

Go to the documentation of this file.
00001 /*********************************************************************
00002 * Software License Agreement (BSD License)
00003 * 
00004 *  Copyright (c) 2008, Willow Garage, Inc.
00005 *  All rights reserved.
00006 * 
00007 *  Redistribution and use in source and binary forms, with or without
00008 *  modification, are permitted provided that the following conditions
00009 *  are met:
00010 * 
00011 *   * Redistributions of source code must retain the above copyright
00012 *     notice, this list of conditions and the following disclaimer.
00013 *   * Redistributions in binary form must reproduce the above
00014 *     copyright notice, this list of conditions and the following
00015 *     disclaimer in the documentation and/or other materials provided
00016 *     with the distribution.
00017 *   * Neither the name of the Willow Garage nor the names of its
00018 *     contributors may be used to endorse or promote products derived
00019 *     from this software without specific prior written permission.
00020 * 
00021 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00022 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00023 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00024 *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00025 *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00026 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00027 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00028 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00029 *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00030 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00031 *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00032 *  POSSIBILITY OF SUCH DAMAGE.
00033 *********************************************************************/
00034 
00035 //
00036 // C version of stereo algorithm
00037 // Basic column-oriented library
00038 //
00039 
00040 #include <frame_common/stereolib.h>
00041 #include <emmintrin.h>
00042 #include <stdio.h>
00043 
00044 // algorithm requires larger buffers to be passed in
00045 // using lib fns like "memset" can cause problems (?? not sure -
00046 //   it was a stack alignment problem)
00047 // -- but still roll our own
00048 
00049 static inline void
00050 memclr(void *buf, int n)
00051 {
00052   int i;
00053   unsigned char *bb = (unsigned char *)buf;
00054   for (i=0; i<n; i++)
00055     *bb++ = 0;
00056 }
00057 
00058 
00059 //
00060 // simple normalization prefilter
00061 // XKERN x YKERN mean intensity
00062 // normalize center pixels
00063 // feature values are unsigned bytes, offset at <ftzero> 
00064 //
00065 // image aligned at 16 bytes
00066 // feature output aligned at 16 bytes
00067 //
00068 
00069 void
00070 do_prefilter_norm(uint8_t *im,  // input image
00071           uint8_t *ftim,        // feature image output
00072           int xim, int yim,     // size of image
00073           uint8_t ftzero,       // feature offset from zero
00074           uint8_t *buf          // buffer storage
00075           )
00076 {
00077   int i,j;
00078   uint8_t *imp, *impp;
00079   uint16_t acc;
00080   uint8_t *topp, *toppp, *botp, *botpp, *cenp;
00081   int32_t qval;
00082   uint32_t uqval;
00083   uint8_t *ftimp;
00084 
00085   // set up buffers
00086   uint16_t *accbuf, *accp;
00087   int ACCBUFSIZE = xim+64;
00088   accbuf = (uint16_t *)buf;
00089 
00090   // image ptrs
00091   imp = im;                     // leading
00092   impp = im;                    // lagging
00093 
00094   // clear acc buffer
00095   memclr(accbuf, ACCBUFSIZE*sizeof(int16_t));
00096 
00097   // loop over rows
00098   for (j=0; j<yim; j++, imp+=xim, ftim+=xim)
00099     {
00100       acc = 0;                  // accumulator
00101       accp = accbuf;            // start at beginning of buf
00102       topp  = imp;              // leading integration ptr
00103       toppp = imp;              // lagging integration ptr
00104       botp  = impp;             // leading integration ptr
00105       botpp = impp;             // lagging integration ptr
00106 
00107       if (j<YKERN)              // initial row accumulation
00108         {
00109           for (i=0; i<XKERN; i++) // initial col accumulation
00110             acc += *topp++;
00111           for (; i<xim; i++) // incremental col acc
00112             {
00113               acc += *topp++ - *toppp++; // do line sum increment
00114               *accp++ += acc;   // increment acc buf value
00115             }
00116         }
00117       else                      // incremental accumulation
00118         {
00119           cenp = imp - (YKERN/2)*xim + XKERN/2 + 1; // center pixel
00120           ftimp = ftim - (YKERN/2)*xim + XKERN/2 + 1; // output ptr
00121           for (i=0; i<XKERN; i++) // initial col accumulation
00122             acc += *topp++ - *botp++;
00123           for (; i<xim; i++, accp++, cenp++) // incremental col acc
00124             {
00125               acc += *topp++ - *toppp++; // do line sum increment
00126               acc -= *botp++ - *botpp++; // subtract out previous vals
00127               *accp += acc;     // increment acc buf value
00128 
00129               // now calculate diff from mean value and save
00130               qval = 4*(*cenp) + *(cenp-1) + *(cenp+1) + *(cenp-xim) + *(cenp+xim);
00131 #if (XKERN==9)
00132               qval = (qval*10)+*cenp - *accp;   // difference with mean, norm is 81
00133               qval = qval>>4;   // divide by 16 (cenp val divide by ~2)
00134 #endif
00135 #if (XKERN==7)
00136               qval = (qval*6)+*cenp - *accp;    // difference with mean, norm is 49
00137               qval = qval>>3;   // divide by 8 (cenp val divide by ~2)
00138 #endif
00139 
00140               if (qval < -ftzero)
00141                 uqval = 0;
00142               else if (qval > ftzero)
00143                 uqval = ftzero+ftzero;
00144               else
00145                 uqval = qval + ftzero;
00146               *ftimp++ = (uint8_t)uqval;
00147             }
00148           impp += xim;          // increment lagging image ptr
00149         }
00150     }
00151 }
00152 
00153 
00154 
00155 //
00156 // x-oriented Sobel prefilter
00157 // feature values are unsigned bytes, offset at <ftzero> 
00158 // buffer <buf> is not used
00159 //
00160 // image aligned at 16 bytes
00161 // feature output aligned at 16 bytes
00162 //
00163 
00164 void
00165 do_prefilter_xsobel(uint8_t *im, // input image
00166           uint8_t *ftim,        // feature image output
00167           int xim, int yim,     // size of image
00168           uint8_t ftzero,       // feature offset from zero
00169           uint8_t *buf          // buffer storage
00170           )
00171 {
00172   int i,j;
00173   uint8_t *impp, *impc, *impn;
00174   uint8_t *ftimp;
00175   int v;
00176 
00177   // image ptrs
00178   ftim += xim + 1;              // do we offset output too????
00179 
00180   // loop over rows
00181   for (j=1; j<yim-1; j++, im+=xim, ftim+=xim)
00182     {
00183       impp = im;
00184       impc = im+xim;
00185       impn = im+xim+xim;
00186       ftimp = ftim;
00187       for (i=1; i<xim-1; i+=4, impp+=4, impc+=4, impn+=4) // loop over line
00188             {
00189               // xsobel filter is -1 -2 -1; 0 0 0; 1 2 1
00190               v = *(impc+2)*2 - *(impc)*2 + *(impp+2) -
00191                 *(impp) + *(impn+2) - *(impn);
00192               v = v >> 0;       // normalize
00193               if (v < -ftzero)
00194                 v = 0;
00195               else if (v > ftzero)
00196                 v = ftzero+ftzero;
00197               else
00198                 v = v + ftzero;
00199               *ftimp++ = (uint8_t)v;
00200 
00201               v = *(impc+3)*2 - *(impc+1)*2 + *(impp+3) -
00202                 *(impp+1) + *(impn+3) - *(impn+1);
00203               v = v >> 0;       // normalize
00204               if (v < -ftzero)
00205                 v = 0;
00206               else if (v > ftzero)
00207                 v = ftzero+ftzero;
00208               else
00209                 v = v + ftzero;
00210               *ftimp++ = (uint8_t)v;
00211 
00212               v = *(impc+4)*2 - *(impc+2)*2 + *(impp+4) -
00213                 *(impp+2) + *(impn+4) - *(impn+2);
00214               v = v >> 0;       // normalize
00215               if (v < -ftzero)
00216                 v = 0;
00217               else if (v > ftzero)
00218                 v = ftzero+ftzero;
00219               else
00220                 v = v + ftzero;
00221               *ftimp++ = (uint8_t)v;
00222 
00223               v = *(impc+5)*2 - *(impc+3)*2 + *(impp+5) -
00224                 *(impp+3) + *(impn+5) - *(impn+3);
00225               v = v >> 0;       // normalize
00226               if (v < -ftzero)
00227                 v = 0;
00228               else if (v > ftzero)
00229                 v = ftzero+ftzero;
00230               else
00231                 v = v + ftzero;
00232               *ftimp++ = (uint8_t)v;
00233             }
00234     }
00235 }
00236 
00237 
00238 //
00239 // stereo routines
00240 //
00241 
00242 //
00243 // column-oriented stereo
00244 // needs all rows before getting any disparity output
00245 //
00246 
00247 void
00248 do_stereo_y(uint8_t *lim, uint8_t *rim, // input feature images
00249           int16_t *disp,        // disparity output
00250           int16_t *text,        // texture output
00251           int xim, int yim,     // size of images
00252           uint8_t ftzero,       // feature offset from zero
00253           int xwin, int ywin,   // size of corr window, usually square
00254           int dlen,             // size of disparity search, multiple of 8
00255           int tfilter_thresh,   // texture filter threshold
00256           int ufilter_thresh,   // uniqueness filter threshold, percent
00257           uint8_t *buf          // buffer storage
00258           )
00259 
00260 {
00261   int i,j,d;                    // iteration indices
00262   int16_t *accp, *accpp, *accminp; // acc buffer ptrs
00263   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; // feature image ptrs
00264   int8_t *corrend, *corrp, *corrpp; // corr buffer ptrs
00265   int16_t *intp, *intpp;        // integration buffer pointers
00266   int16_t *textpp;              // texture buffer pointer
00267   int16_t *dispp, *disppp;      // disparity output pointer
00268   int16_t *textp;               // texture output pointer
00269   int16_t acc, newv;
00270   int8_t  newc, temp;
00271   int16_t umin, uminthresh;     // for uniqueness check
00272   int dval;                     // disparity value
00273   
00274   // set up buffers
00275 #define YINTBUFSIZE (yim+64)
00276   int16_t *intbuf  = (int16_t *)buf;    // integration buffer
00277 #define YTEXTBUFSIZE (yim+64)
00278   int16_t *textbuf = (int16_t *)&buf[YINTBUFSIZE*sizeof(int16_t)];      // texture buffer
00279 #define YACCBUFSIZE (yim*dlen + 64)
00280   int16_t *accbuf  = (int16_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE)*sizeof(int16_t)]; // accumulator buffer
00281   int8_t  *corrbuf = (int8_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE + YACCBUFSIZE)*sizeof(int16_t)]; // correlation buffer
00282 
00283   // clear out buffers
00284   memclr(intbuf, yim*sizeof(int16_t));
00285   memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
00286   memclr(accbuf, dlen*yim*sizeof(int16_t));
00287   memclr(textbuf, yim*sizeof(int16_t));
00288 
00289   // set up corrbuf pointers
00290   corrend = corrbuf + dlen*yim*xwin;
00291   corrp = corrbuf;
00292   accminp = 0;                  // compiler warning
00293 
00294   // start further out on line to take care of disparity offsets
00295   limp = (int8_t *)lim + dlen;
00296   limp2 = limp;
00297   rimp = (int8_t *)rim + dlen;
00298   // offset to start of good disparity area
00299   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
00300   textp = NULL;
00301   if (text != NULL)             // optional texture buffer output
00302     textp = text + dlen;
00303 
00304   // normalize texture threshold
00305   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
00306   tfilter_thresh = tfilter_thresh / 100; // now at percent of max
00307   //  tfilter_thresh = tfilter_thresh / 10;
00308 
00309   // iterate over columns first
00310   // buffer is column-oriented, not line-oriented
00311   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++)
00312     {    
00313       accp = accbuf;
00314       if (corrp >= corrend) corrp = corrbuf;
00315           
00316       // iterate over disparities, to do incremental calcs on each disparity
00317       // can't mix disparities!
00318       for (d=0; d<dlen; d++, accp+=yim, corrp+=yim)      
00319         {
00320           limpp = limp;
00321           rimpp = rimp-d;
00322           intp = intbuf+ywin-1;
00323           intpp = intbuf;
00324           accpp = accp;
00325           corrpp = corrp;
00326               
00327           // iterate over rows
00328           // have to skip down a row each time...
00329           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
00330             {
00331               newc = abs(*limpp - *rimpp); // new corr val
00332               newv = newc - *corrpp + *intp++; // new acc val
00333               *corrpp++ = newc; // save new corr val
00334               *intp = newv;     // save new acc val
00335               *accpp++ += newv - *intpp++; // update window sum
00336             }
00337         } 
00338 
00339       // average texture computation
00340       // use full corr window
00341       limpp = limp;
00342       limpp2 = limp2;
00343       intp = intbuf+ywin-1;
00344       intpp = intbuf;
00345       accpp = textbuf;
00346       acc = 0;
00347           
00348       // iterate over rows
00349       // have to skip down a row each time...
00350       // check for initial period
00351       if (i < xwin)
00352         {
00353           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
00354             {
00355               temp = abs(*limpp- ftzero); 
00356               *intp = temp;
00357               acc += *intp++ - *intpp++;
00358               *accpp++ += acc;
00359             }
00360         }
00361       else
00362         {
00363           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
00364             {
00365               temp = abs(*limpp-ftzero); 
00366               *intp = temp - abs(*limpp2-ftzero);
00367               acc += *intp++ - *intpp++;
00368               *accpp++ += acc;
00369             }
00370           limp2++;
00371         }
00372           
00373       // disparity extraction, find min of correlations
00374       if (i >= xwin)            // far enough along...
00375         {
00376           disppp = dispp;
00377           accp   = accbuf + (ywin-1); // results within initial corr window are partial
00378           textpp = textbuf + (ywin-1); // texture measure
00379 
00380           // iterate over rows
00381           for (j=0; j<yim-ywin-YKERN+2; j++, accp++, disppp+=xim, textpp++) 
00382             {
00383               umin = 32000;
00384               accpp = accp;
00385               dval = -1;
00386 
00387               if (*textpp < tfilter_thresh)
00388                 {
00389                   *disppp = FILTEREDVAL;
00390                   continue;
00391                 }
00392 
00393               // find minimum and index over disparities
00394               for (d=0; d<dlen; d++, accpp+=yim)
00395                 {
00396                   if (*accpp < umin)
00397                     {
00398                       accminp = accpp;
00399                       umin = *accpp;
00400                       dval = d;
00401                     }
00402                 }  
00403 
00404               // check uniqueness
00405               accpp = accp;           
00406               if (ufilter_thresh > 0)
00407                 {
00408                   uminthresh = umin + (umin * ufilter_thresh) / 100;
00409                   for (d=0; d<dlen; d++, accpp+=yim)
00410                     {
00411                       if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
00412                         {
00413                           dval = -1;
00414                           break;
00415                         }
00416                     }
00417                 }
00418               if (dval < 0)
00419                 *disppp = FILTEREDVAL;
00420               else
00421                 {
00422                   double c, p, n, v;
00423                   c = (double)*accminp;
00424                   p = (double)*(accminp-yim);
00425                   n = (double)*(accminp+yim);
00426                   v = (double)dval + (p-n)/(2*(p+n-2*c));
00427                   *disppp = (int)(0.5 + 16*v);
00428                 }
00429             } // end of row loop
00430 
00431           dispp++;              // go to next column of disparity output
00432         }
00433 
00434     } // end of outer column loop
00435 
00436 }
00437 
00438 //
00439 // inner loop here is over disparities
00440 //
00441 
00442 void
00443 do_stereo_d(uint8_t *lim, uint8_t *rim, // input feature images
00444             int16_t *disp,      // disparity output
00445             int16_t *text,      // texture output
00446             int xim, int yim,   // size of images
00447             uint8_t ftzero,     // feature offset from zero
00448             int xwin, int ywin, // size of corr window, usually square
00449             int dlen,           // size of disparity search, multiple of 8
00450             int tfilter_thresh, // texture filter threshold
00451             int ufilter_thresh, // uniqueness filter threshold, percent
00452             uint8_t *buf        // buffer storage
00453             )
00454 
00455 {
00456   int i,j,d;                    // iteration indices
00457   int16_t *accp, *accpp, *accminp; // acc buffer ptrs
00458   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; // feature image ptrs
00459   int8_t *corrend, *corrp, *corrpp; // corr buffer ptrs
00460   int16_t *intp, *intpp;        // integration buffer pointers
00461   int16_t *textpp;              // texture buffer pointer
00462   int16_t *dispp, *disppp;      // disparity output pointer
00463   int16_t *textp;               // texture output pointer
00464   int16_t acc, newv;
00465   int8_t  temp, newc;
00466   int16_t umin, uminthresh;     // for uniqueness check
00467   int dval;                     // disparity value
00468   
00469   int16_t *intbuf, *textbuf, *accbuf;
00470   int8_t *corrbuf;
00471 
00472   int pfilter_thresh = 2*ywin*xwin;
00473 
00474 #define INTBUFSIZE ((yim-YKERN+ywin)*dlen+64)
00475   intbuf  = (int16_t *)buf;     // integration buffer
00476 #define TEXTBUFSIZE (yim+64)
00477   textbuf = (int16_t *)&buf[INTBUFSIZE*sizeof(int16_t)];        // texture buffer
00478 #define ACCBUFSIZE (yim*dlen + 64)
00479   accbuf  = (int16_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE)*sizeof(int16_t)]; // accumulator buffer
00480   corrbuf = (int8_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)]; // correlation buffer
00481 
00482   // clear out buffers
00483   memclr(intbuf, dlen*yim*sizeof(int16_t));
00484   memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
00485   memclr(accbuf, dlen*yim*sizeof(int16_t));
00486   memclr(textbuf, yim*sizeof(int16_t));
00487 
00488   // set up corrbuf pointers
00489   corrend = corrbuf + dlen*yim*xwin;
00490   corrp = corrbuf;
00491   accminp = 0;                  // compiler warning
00492 
00493   // start further out on line to take care of disparity offsets
00494   limp = (int8_t *)lim + dlen - 1;
00495   limp2 = limp;
00496   rimp = (int8_t *)rim;
00497   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
00498   textp = NULL;
00499   if (text != NULL)             // optional texture buffer output
00500     textp = text + dlen;
00501 
00502   // normalize texture threshold
00503   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
00504   tfilter_thresh = tfilter_thresh / 100; // now at percent of max
00505   //  tfilter_thresh = tfilter_thresh / 10;
00506 
00507   // iterate over columns first
00508   // acc buffer is column-oriented, not line-oriented
00509   // at each iteration, move across one column
00510   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
00511     {    
00512       accp = accbuf;
00513       if (corrp >= corrend) corrp = corrbuf;
00514       limpp = limp;
00515       rimpp = rimp;
00516       corrpp = corrp;
00517       intp = intbuf+(ywin-1)*dlen; // intbuf current ptr
00518       intpp = intbuf;   // intbuf old ptr
00519           
00520       // iterate over rows
00521       for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
00522         {
00523           // iterate over disparities
00524           // have to use an intbuf column for each disparity
00525           for (d=0; d<dlen; d++, intp++)
00526             {
00527               // do SAD calculation
00528               newc = abs(*limpp - rimpp[d]); // new corr val
00529               newv = newc - *corrpp + *intp; // new acc val
00530               *corrpp++ = newc; // save new corr val
00531               *(intp+dlen) = newv; // save new acc val
00532               *accp++ += newv - *intpp++; // update window sum
00533             }
00534         } 
00535 
00536       // average texture computation
00537       // use full corr window
00538       memclr(intbuf, ywin*sizeof(int16_t));
00539       limpp = limp;
00540       limpp2 = limp2;
00541       intp = intbuf+ywin-1;
00542       intpp = intbuf;
00543       accpp = textbuf;
00544       acc = 0;
00545           
00546       // iterate over rows
00547       // have to skip down a row each time...
00548       // check for initial period
00549       if (i < xwin)
00550         {
00551           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
00552             {
00553               temp = abs(*limpp- ftzero); 
00554               *intp = temp;
00555               acc += *intp++ - *intpp++;
00556               *accpp++ += acc;
00557             }
00558         }
00559       else
00560         {
00561           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
00562             {
00563               temp = abs(*limpp-ftzero); 
00564               *intp = temp - abs(*limpp2-ftzero);
00565               acc += *intp++ - *intpp++;
00566               *accpp++ += acc;
00567             }
00568           limp2++;
00569         }
00570           
00571       // disparity extraction, find min of correlations
00572       if (i >= xwin)            // far enough along...
00573         {
00574           disppp = dispp;
00575           accp   = accbuf + (ywin-1)*dlen; // results within initial corr window are partial
00576           textpp = textbuf + (ywin-1); // texture measure
00577 
00578           // iterate over rows
00579           for (j=0; j<yim-ywin-YKERN+2; j++, accp+=dlen, disppp+=xim, textpp++) 
00580             {
00581               umin = 32000;
00582               accpp = accp;
00583               dval = -1;
00584 
00585               if (*textpp < tfilter_thresh)
00586                 {
00587                   *disppp = FILTEREDVAL;
00588                   continue;
00589                 }
00590 
00591               // find minimum and index over disparities
00592               for (d=dlen-1; d>=0; d--, accpp++)
00593                 {
00594                   if (*accpp < umin)
00595                     {
00596                       accminp = accpp;
00597                       umin = *accpp;
00598                       dval = d;
00599                     }
00600                 }  
00601 
00602               // check uniqueness
00603               accpp = accp;           
00604               if (ufilter_thresh > 0)
00605                 {
00606                   uminthresh = umin + (umin * ufilter_thresh) / 100;
00607                   for (d=dlen-1; d>=0; d--, accpp++)
00608                     {
00609                       if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
00610                         {
00611                           dval = -1;
00612                           break;
00613                         }
00614                     }
00615                 }
00616               if (dval < 0)
00617                 *disppp = FILTEREDVAL;
00618               else
00619                 {
00620                   double c, p, n, v;
00621                   c = (double)*accminp;
00622                   p = (double)*(accminp+1);
00623                   n = (double)*(accminp-1);
00624                 
00625                   //Peak filter
00626                   if( *(accminp+1) + *(accminp-1) - 2*(*accminp) < pfilter_thresh){     
00627                         *disppp = FILTEREDVAL;
00628                   }     
00629                   else
00630                   {
00631 #define POW 0.7
00632 #if 1
00633                     // power law correction
00634                     v = (p-n)/(p+n-2*c);
00635                     if (v >= 0) v = pow(v,POW);
00636                     else v = -pow(-v,POW);
00637 #elif 0
00638                     // num/den addition correction
00639                     double k = 1.0;
00640                     v = (k+1)*(p-n)/((p+n-2*c)+k*abs(p-n));
00641 #elif 0
00642                     // standard quadratic interpolation
00643                     v = (p-n)/(p+n-2*c);
00644 #else
00645                     v = (p-n)/(p+n-2*c);
00646                     v = 0;
00647 #endif
00648                     v = (double)dval + 0.5*v;
00649                     *disppp = (int)(0.5 + 16.0*v);
00650                   }
00651                 }
00652             } // end of row loop
00653 
00654           dispp++;              // go to next column of disparity output
00655         }
00656 
00657     } // end of outer column loop
00658 
00659 }
00660 
00661 
00662 // algorithm requires larger buffers to be passed in
00663 // using lib fns like "memset" can cause problems (?? not sure -
00664 //   it was a stack alignment problem)
00665 // -- but still roll our own
00666 
00667 static inline void
00668 memclr_si128(__m128i *buf, int n)
00669 {
00670   int i;
00671   __m128i zz;
00672   zz = _mm_setzero_si128();
00673   n = n>>4;                     // divide by 16
00674   for (i=0; i<n; i++, buf++)
00675     _mm_store_si128(buf,zz);
00676 }
00677 
00678 //
00679 // fast SSE2 version
00680 // NOTE: output buffer <ftim> must be aligned on 16-byte boundary
00681 // NOTE: input image <im> must be aligned on 16-byte boundary
00682 // NOTE: KSIZE (XKERN, YKERN) is fixed at 7
00683 // 
00684 
00685 #define PXKERN 7
00686 #define PYKERN 7
00687 
00688 void
00689 do_prefilter_fast(uint8_t *im,  // input image
00690           uint8_t *ftim,        // feature image output
00691           int xim, int yim,     // size of image
00692           uint8_t ftzero,       // feature offset from zero
00693           uint8_t *buf          // buffer storage
00694           )
00695 {
00696   int i,j;
00697 
00698   // set up buffers, first align to 16 bytes
00699   uintptr_t bufp = (uintptr_t)buf;
00700   int16_t *accbuf, *accp;
00701   int16_t *intbuf, *intp;       // intbuf is size xim
00702   uint8_t *imp, *impp, *ftimp;
00703   __m128i acc, accs, acc1, accn1, accn2, acct, pxs, opxs, zeros;
00704   __m128i const_ftzero, const_ftzero_x48, const_ftzero_x2;
00705 
00706   int FACCBUFSIZE = xim+64;
00707 
00708   if (bufp & 0xF)
00709     bufp = (bufp+15) & ~(uintptr_t)0xF;
00710   buf = (uint8_t *)bufp;
00711   accbuf = (int16_t *)buf;
00712 
00713   intbuf = (int16_t *)&buf[FACCBUFSIZE*sizeof(int16_t)];        // integration buffer
00714   bufp = (uintptr_t)intbuf;
00715   if (bufp & 0xF)
00716     bufp = (bufp+15) & ~(uintptr_t)0xF;
00717   intbuf = (int16_t *)bufp;  
00718 
00719   // clear buffers
00720   memclr_si128((__m128i *)accbuf, FACCBUFSIZE*sizeof(int16_t));
00721   memclr_si128((__m128i *)intbuf, 8*sizeof(uint16_t));
00722   impp = im;                    // old row window pointer
00723 
00724   // constants
00725   zeros = _mm_setzero_si128();
00726   const_ftzero     = _mm_set1_epi16(ftzero);
00727   const_ftzero_x48 = _mm_set1_epi16(ftzero*48);
00728   const_ftzero_x2  = _mm_set1_epi16(ftzero*2);
00729 
00730   // loop over rows
00731   for (j=0; j<yim; j++, im+=xim, ftim+=xim)
00732     {
00733       accp = accbuf;            // start at beginning of buf
00734       intp = intbuf+8;
00735       acc = _mm_setzero_si128(); 
00736       imp = im;                 // new row window ptr
00737       impp = im - PYKERN*xim;
00738 
00739       // initial row accumulation
00740       if (j<PYKERN)
00741         {
00742           for (i=0; i<xim; i+=16, imp+=16, intp+=16, accp+=16)
00743             {
00744               pxs = _mm_load_si128((__m128i *)imp); // next 16 pixels
00745               accs = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00746               acc = _mm_srli_si128(acc, 14); // shift highest word to lowest
00747               accs = _mm_add_epi16(accs, acc); // add it in
00748               // sum horizontally
00749               acct = _mm_slli_si128(accs, 2); // shift left one word
00750               accs = _mm_add_epi16(accs, acct); // add it in
00751               acct = _mm_slli_si128(accs, 4); // shift left two words
00752               accs = _mm_add_epi16(accs, acct); // add it in
00753               acct = _mm_slli_si128(accs, 8); // shift left four words
00754               acc1 = _mm_add_epi16(accs, acct); // add it in, done
00755               _mm_store_si128((__m128i *)intp,acc1); // stored
00756               // next 8 pixels
00757               accs = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00758               acc = _mm_srli_si128(acc1, 14); // shift highest word to lowest
00759               accs = _mm_add_epi16(accs, acc); // add it in
00760               // sum horizontally
00761               acct = _mm_slli_si128(accs, 2); // shift left one word
00762               accs = _mm_add_epi16(accs, acct); // add it in
00763               acct = _mm_slli_si128(accs, 4); // shift left two words
00764               accs = _mm_add_epi16(accs, acct); // add it in
00765               acct = _mm_slli_si128(accs, 8); // shift left four words
00766               acc  = _mm_add_epi16(accs, acct); // add it in, done
00767               _mm_store_si128((__m128i *)(intp+8),acc); // stored
00768               
00769               // update acc buffer, first 8 vals
00770               acct = _mm_loadu_si128((__m128i *)(intp-7)); // previous int buffer values
00771               acc1 = _mm_sub_epi16(acc1,acct);
00772               accs = _mm_load_si128((__m128i *)accp); // acc value
00773               acc1 = _mm_add_epi16(acc1,accs);
00774               _mm_store_si128((__m128i *)accp,acc1); // stored        
00775               // update acc buffer, second 8 vals
00776               acct = _mm_loadu_si128((__m128i *)(intp-7+8)); // previous int buffer values
00777               acc1 = _mm_sub_epi16(acc,acct);
00778               accs = _mm_load_si128((__m128i *)(accp+8)); // acc value
00779               acc1 = _mm_add_epi16(acc1,accs);
00780               _mm_store_si128((__m128i *)(accp+8),acc1); // stored            
00781 
00782             }
00783         }
00784 
00785 
00786       else                      // incremental accumulation
00787         {
00788           for (i=0; i<xim; i+=16, imp+=16, impp+=16, intp+=16, accp+=16)
00789             {
00790               pxs = _mm_load_si128((__m128i *)imp); // next 16 pixels
00791               opxs = _mm_load_si128((__m128i *)impp); // next 16 pixels
00792               accs = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00793               acc = _mm_srli_si128(acc, 14); // shift highest word to lowest
00794               accs = _mm_add_epi16(accs, acc); // add it in
00795               acct = _mm_unpacklo_epi8(opxs,zeros); // unpack first 8 into words
00796               accs = _mm_sub_epi16(accs, acct); // subtract it out
00797 
00798               // sum horizontally
00799               acct = _mm_slli_si128(accs, 2); // shift left one word
00800               accs = _mm_add_epi16(accs, acct); // add it in
00801               acct = _mm_slli_si128(accs, 4); // shift left two words
00802               accs = _mm_add_epi16(accs, acct); // add it in
00803               acct = _mm_slli_si128(accs, 8); // shift left four words
00804               acc1 = _mm_add_epi16(accs, acct); // add it in, done
00805               _mm_store_si128((__m128i *)intp,acc1); // stored
00806 
00807               // next 8 pixels
00808               accs = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00809               acc = _mm_srli_si128(acc1, 14); // shift highest word to lowest
00810               accs = _mm_add_epi16(accs, acc); // add it in
00811               acct = _mm_unpackhi_epi8(opxs,zeros); // unpack first 8 into words
00812               accs = _mm_sub_epi16(accs, acct); // subtract it out
00813 
00814               // sum horizontally
00815               acct = _mm_slli_si128(accs, 2); // shift left one word
00816               accs = _mm_add_epi16(accs, acct); // add it in
00817               acct = _mm_slli_si128(accs, 4); // shift left two words
00818               accs = _mm_add_epi16(accs, acct); // add it in
00819               acct = _mm_slli_si128(accs, 8); // shift left four words
00820               acc  = _mm_add_epi16(accs, acct); // add it in, done
00821               _mm_store_si128((__m128i *)(intp+8),acc); // stored
00822               
00823               // update acc buffer, first 8 vals
00824               acct = _mm_loadu_si128((__m128i *)(intp-7)); // previous int buffer values
00825               acc1 = _mm_sub_epi16(acc1,acct);
00826               accs = _mm_load_si128((__m128i *)accp); // acc value
00827               acc1 = _mm_add_epi16(acc1,accs);
00828               _mm_store_si128((__m128i *)accp,acc1); // stored        
00829               // update acc buffer, second 8 vals
00830               acct = _mm_loadu_si128((__m128i *)(intp-7+8)); // previous int buffer values
00831               acc1 = _mm_sub_epi16(acc,acct);
00832               accs = _mm_load_si128((__m128i *)(accp+8)); // acc value
00833               acc1 = _mm_add_epi16(acc1,accs);
00834               _mm_store_si128((__m128i *)(accp+8),acc1); // stored            
00835             }
00836 
00837           // now do normalization and saving of results
00838           accp = accbuf+6;      // start at beginning of good values, off by 1 pixel
00839           imp = im - (PYKERN/2)*xim + PXKERN/2;
00840           ftimp = ftim - (PYKERN/2)*xim + PXKERN/2;
00841           for (i=0; i<xim-8; i+=16, imp+=16, accp+=16, ftimp+=16)
00842             {
00843               // sum up weighted pixels in a 4-square pattern
00844               pxs = _mm_loadu_si128((__m128i *)imp); // next 16 pixels
00845               accn1 = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00846               accn2  = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00847               accs = _mm_slli_epi16(accn1,2); // multiply by 4
00848               acc  = _mm_slli_epi16(accn2,2); // multiply by 4        
00849               pxs  = _mm_loadu_si128((__m128i *)(imp+1)); // 16 pixels to the right
00850               acct = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00851               acc1 = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00852               accs = _mm_add_epi16(acct,accs);
00853               acc  = _mm_add_epi16(acc1,acc);
00854               pxs  = _mm_loadu_si128((__m128i *)(imp-1)); // 16 pixels to the left
00855               acct = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00856               acc1 = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00857               accs = _mm_add_epi16(acct,accs);
00858               acc  = _mm_add_epi16(acc1,acc);
00859               pxs  = _mm_loadu_si128((__m128i *)(imp+xim)); // 16 pixels below
00860               acct = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00861               acc1 = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00862               accs = _mm_add_epi16(acct,accs);
00863               acc  = _mm_add_epi16(acc1,acc);
00864               pxs  = _mm_loadu_si128((__m128i *)(imp-xim)); // 16 pixels above
00865               acct = _mm_unpacklo_epi8(pxs,zeros); // unpack first 8 into words
00866               acc1 = _mm_unpackhi_epi8(pxs,zeros); // unpack second 8 into words
00867               accs = _mm_add_epi16(acct,accs);
00868               acc  = _mm_add_epi16(acc1,acc);
00869               // first 8 values
00870               // times weighted center by 6, giving 48x single pixel value
00871               acct = _mm_slli_epi16(accs,2); // multiply by 4
00872               accs = _mm_add_epi16(accs,accs); // double
00873               accs = _mm_add_epi16(accs,acct); // now x6
00874               accs = _mm_add_epi16(accs,accn1); // +1 is 49x
00875               // subtract from window sum
00876               acct = _mm_loadu_si128((__m128i *)accp);
00877               accs = _mm_sub_epi16(accs,acct); // subtract out norm
00878               // normalize to ftzero and saturate, divide by 8
00879               accs = _mm_srai_epi16(accs,3); // divide by 8
00880               accs = _mm_adds_epi16(accs,const_ftzero); // normalize to ftzero
00881               // saturate to [0, ftzero*2]
00882               accs = _mm_max_epi16(accs,zeros); // floor of 0
00883               accs = _mm_min_epi16(accs,const_ftzero_x2);
00884 
00885               // second 8 values
00886               // times weighted center by 6
00887               acct = _mm_slli_epi16(acc,2); // multiply by 4
00888               acc = _mm_add_epi16(acc,acc); // double
00889               acc = _mm_add_epi16(acc,acct); // now x6
00890               acc = _mm_add_epi16(acc,accn2); // now 49x
00891               // subtract from window sum, get absolute value
00892               acct = _mm_loadu_si128((__m128i *)(accp+8));
00893               acc1 = _mm_sub_epi16(acc,acct);
00894               // normalize to ftzero and saturate, divide by 8
00895               acc1 = _mm_srai_epi16(acc1,3); // divide by 8
00896               acc1 = _mm_adds_epi16(acc1,const_ftzero); // normalize to ftzero
00897               // saturate to [0, ftzero*2]
00898               acc1 = _mm_max_epi16(acc1,zeros); // floor of 0
00899               acc1 = _mm_min_epi16(acc1,const_ftzero_x2);
00900 
00901               // pack both results
00902               accs = _mm_packus_epi16(accs,acc1);
00903               _mm_storeu_si128((__m128i *)ftimp,accs);
00904               
00905             }
00906 
00907         } // end of j >= YKERN section
00908       
00909     }
00910 
00911 }
00912 
00913 //
00914 // fast version using SSE intrinsics
00915 //
00916 
00917 
00918 #define EXACTUNIQ   // set this to do exact uniqueness values
00919 
00920 void
00921 do_stereo_d_fast(uint8_t *lim, uint8_t *rim, // input feature images
00922             int16_t *disp,      // disparity output
00923             int16_t *text,      // texture output
00924             int xim, int yim,   // size of images
00925             uint8_t ftzero,     // feature offset from zero
00926             int xwin, int ywin, // size of corr window, usually square
00927             int dlen,           // size of disparity search, multiple of 8
00928             int tfilter_thresh, // texture filter threshold
00929             int ufilter_thresh, // uniqueness filter threshold, percent
00930             uint8_t *buf        // buffer storage
00931             )
00932 {
00933   int i,j,k,d;                  // iteration indices
00934   int16_t *accp, *accpp;        // acc buffer ptrs
00935   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; // feature image ptrs
00936   int8_t *corrend, *corrp, *corrpp; // corr buffer ptrs
00937   int16_t *intp, *intpp;        // integration buffer pointers
00938   int16_t *textpp;              // texture buffer pointer
00939   int16_t *dispp, *disppp;      // disparity output pointer
00940   int16_t *textp;               // texture output pointer
00941   int16_t acc;
00942   int dval;                     // disparity value
00943   int uniqthresh;               // fractional 16-bit threshold
00944   
00945   // xmm variables
00946   __m128i limpix, rimpix, newpix, tempix, oldpix;
00947   __m128i newval, temval;
00948   __m128i minv, indv, indtop, indd, indm, temv, nexv;
00949   __m128i cval, pval, nval, ival; // correlation values for subpixel disparity interpolation
00950   __m128i denv, numv, intv, sgnv; // numerator, denominator, interpolation, sign of subpixel interp
00951   __m128i uniqcnt, uniqth, uniqinit, unim, uval; // uniqueness count, uniqueness threshold
00952 
00953   // xmm constants, remember the 64-bit values are backwards
00954 #ifdef WIN32
00955   const __m128i p0xfffffff0 = {0x0, 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
00956                                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
00957   const __m128i p0x0000000f = {0xff, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
00958                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
00959   const __m128i zeros = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
00960                          0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
00961   const __m128i val_epi16_7fff = {0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 
00962                                   0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f};
00963   const __m128i val_epi16_8    = {0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 
00964                                   0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0};
00965   const __m128i val_epi16_1    = {0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
00966                                   0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00};
00967   const __m128i val_epi16_2    = {0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00,
00968                                   0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00};
00969   const __m128i val_epi16_3    = {0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00,
00970                                   0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00};
00971 #else
00972   const __m128i p0xfffffff0 = {0xffffffffffff0000LL, 0xffffffffffffffffLL};
00973   const __m128i p0x0000000f = {0xffffLL, 0x0LL};
00974   const __m128i zeros = {0x0LL, 0x0LL};
00975   const __m128i val_epi16_7fff = {0x7fff7fff7fff7fffLL, 0x7fff7fff7fff7fffLL};
00976   const __m128i val_epi16_8    = {0x0008000800080008LL, 0x0008000800080008LL};
00977   const __m128i val_epi16_1    = {0x0001000100010001LL, 0x0001000100010001LL};
00978   const __m128i val_epi16_2    = {0x0002000200020002LL, 0x0002000200020002LL};
00979   const __m128i val_epi16_3    = {0x0003000300030003LL, 0x0003000300030003LL};
00980 #endif
00981 
00982   // set up buffers, first align to 16 bytes
00983   uintptr_t bufp = (uintptr_t)buf;
00984   int16_t *intbuf, *textbuf, *accbuf, temp;
00985   int8_t *corrbuf;
00986 
00987   if (bufp & 0xF)
00988     bufp = (bufp+15) & ~(uintptr_t)0xF;
00989   buf = (uint8_t *)bufp;
00990 
00991 #define INTEBUFSIZE ((yim+16+ywin)*dlen)
00992   intbuf  = (int16_t *)buf;     // integration buffer
00993   bufp = (uintptr_t)intbuf;
00994   if (bufp & 0xF)
00995     bufp = (bufp+15) & ~(uintptr_t)0xF;
00996   intbuf = (int16_t *)bufp;  
00997 
00998 #define TXTBUFSIZE (yim + 64)
00999   textbuf = (int16_t *)&buf[INTEBUFSIZE*sizeof(int16_t)];       // texture buffer
01000   bufp = (uintptr_t)textbuf;
01001   if (bufp & 0xF)
01002     bufp = (bufp+15) & ~(uintptr_t)0xF;
01003   textbuf = (int16_t *)bufp;  
01004 
01005 
01006 #define ACCBUFSIZE (yim*dlen + 64)
01007   accbuf  = (int16_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE)*sizeof(int16_t)]; // accumulator buffer
01008   bufp = (uintptr_t)accbuf;
01009   if (bufp & 0xF)
01010     bufp = (bufp+15) & ~(uintptr_t)0xF;
01011   accbuf = (int16_t *)bufp;  
01012 
01013 
01014   corrbuf = (int8_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)]; // correlation buffer
01015   bufp = (uintptr_t)corrbuf;
01016   if (bufp & 0xF)
01017     bufp = (bufp+15) & ~(uintptr_t)0xF;
01018   corrbuf = (int8_t *)bufp;  
01019 
01020   // clear out buffers
01021   memclr_si128((__m128i *)intbuf, dlen*yim*sizeof(int16_t));
01022   memclr_si128((__m128i *)corrbuf, dlen*yim*xwin*sizeof(int8_t));
01023   memclr_si128((__m128i *)accbuf, dlen*yim*sizeof(int16_t));
01024   memclr_si128((__m128i *)textbuf, yim*sizeof(int16_t));
01025 
01026   // set up corrbuf pointers
01027   corrend = corrbuf + dlen*yim*xwin;
01028   corrp = corrbuf;
01029 
01030   // start further out on line to take care of disparity offsets
01031   limp = (int8_t *)lim + dlen - 1;
01032   limp2 = limp;
01033   rimp = (int8_t *)rim;
01034   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
01035   textp = NULL;
01036   if (text != NULL)             // optional texture buffer output
01037     textp = text + dlen;
01038 
01039   // normalize texture threshold
01040   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
01041   tfilter_thresh = tfilter_thresh / 100; // now at percent of max
01042   //  tfilter_thresh = tfilter_thresh / 10;
01043 
01044   // set up some constants
01045   //  indtop = _mm_set_epi16(dlen+7, dlen+6, dlen+5, dlen+4, dlen+3, dlen+2, dlen+1, dlen);
01046   indtop = _mm_set_epi16(dlen+0, dlen+1, dlen+2, dlen+3, dlen+4, dlen+5, dlen+6, dlen+7);
01047   uniqthresh = (0x8000 * ufilter_thresh)/100; // fractional multiplication factor
01048                                               // for uniqueness test
01049   uniqinit = _mm_set1_epi16(uniqthresh);
01050   // init variables so compiler doesn't complain
01051   intv = zeros;
01052   ival = zeros;
01053   nval = zeros;
01054   pval = zeros;
01055   cval = zeros;
01056   uval = zeros;
01057 
01058   // iterate over columns first
01059   // acc buffer is column-oriented, not line-oriented
01060   // at each iteration, move across one column
01061   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
01062     {    
01063       accp = accbuf;
01064       if (corrp >= corrend) corrp = corrbuf;
01065       limpp = limp;
01066       rimpp = rimp;
01067       corrpp = corrp;
01068       intp = intbuf+(ywin-1)*dlen; // intbuf current ptr
01069       intpp = intbuf;   // intbuf old ptr
01070           
01071       // iterate over rows
01072       for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim-dlen)
01073         {
01074           // replicate left image pixel
01075           // could replace this with unpacklo_epi8, shufflelo_epi16, pshuffle_epi32
01076           limpix = _mm_set1_epi8(*limpp);
01077 
01078           // iterate over disparities
01079           // have to use an intbuf column for each disparity
01080           for (d=0; d<dlen; d+=16, intp+=16, intpp+=16, accp+=16, corrpp+=16, rimpp+=16)    
01081             {
01082               // do SAD calculation
01083               //              newc = abs(*limpp - rimpp[d]); // new corr val
01084               rimpix = _mm_loadu_si128((__m128i *)rimpp);
01085               newpix = _mm_subs_epu8(limpix,rimpix); // subtract pixel values, saturate
01086               tempix = _mm_subs_epu8(rimpix,limpix); // subtract pixel values the other way, saturate
01087               newpix = _mm_add_epi8(newpix,tempix); // holds abs value
01088 //            newpix = _mm_sub_epi8(limpix,rimpix); // subtract pixel values, alternate for SSSE3
01089 //            newpix = _mm_abs_epi8(newpix); // absolute value
01090 
01091               // calculate new acc values, 8 at a time
01092               //              newv = newc - *corrpp + *intp; // new acc val
01093               oldpix = _mm_load_si128((__m128i *)corrpp);             
01094               newval = _mm_unpacklo_epi8(newpix,zeros); // unpack into words
01095               temval = _mm_load_si128((__m128i *)intp); // get acc values
01096               newval = _mm_add_epi16(newval,temval);
01097               temval = _mm_unpacklo_epi8(oldpix,zeros); // unpack into words
01098               newval = _mm_sub_epi16(newval,temval); // subtract out old corr vals
01099 
01100               // save new corr value
01101               //              *corrpp++ = newc; // save new corr val
01102               _mm_store_si128((__m128i *)corrpp,newpix);
01103 
01104               // save new acc values
01105               //              *(intp+dlen) = newv; // save new acc val
01106               _mm_store_si128((__m128i *)(intp+dlen),newval);         
01107 
01108               // update windowed sum
01109               //              *accp++ += newv - *intpp++; // update window sum
01110               temval = _mm_load_si128((__m128i *)intpp); // get old acc values
01111               newval = _mm_sub_epi16(newval,temval);
01112               temval = _mm_load_si128((__m128i *)accp); // get window sum values
01113               newval = _mm_add_epi16(newval,temval);
01114               _mm_store_si128((__m128i *)accp,newval);
01115 
01116               // next set of 8 values
01117               // calculate new acc values, 8 at a time
01118               //              newv = newc - *corrpp + *intp; // new acc val
01119               newval = _mm_unpackhi_epi8(newpix,zeros); // unpack into words
01120               temval = _mm_load_si128((__m128i *)(intp+8));     // get acc values
01121               newval = _mm_add_epi16(newval,temval);
01122               temval = _mm_unpackhi_epi8(oldpix,zeros); // unpack into words
01123               newval = _mm_sub_epi16(newval,temval); // subtract out old corr vals
01124 
01125               // save new acc values
01126               //              *(intp+dlen) = newv; // save new acc val
01127               _mm_store_si128((__m128i *)(intp+dlen+8),newval);       
01128 
01129               // update windowed sum
01130               //              *accp++ += newv - *intpp++; // update window sum
01131               temval = _mm_load_si128((__m128i *)(intpp+8)); // get old acc values
01132               newval = _mm_sub_epi16(newval,temval);
01133               temval = _mm_load_si128((__m128i *)(accp+8)); // get window sum values
01134               newval = _mm_add_epi16(newval,temval);
01135               _mm_store_si128((__m128i *)(accp+8),newval);
01136             }
01137         } 
01138 
01139       // average texture computation
01140       // use full corr window
01141       memclr_si128((__m128i *)intbuf, yim*sizeof(int16_t));
01142       limpp = limp;
01143       limpp2 = limp2;
01144       intp = intbuf+ywin-1;
01145       intpp = intbuf;
01146       accpp = textbuf;
01147       acc = 0;
01148           
01149 #if 1                           // this should be optimized
01150       // iterate over rows
01151       // have to skip down a row each time...
01152       // check for initial period
01153       if (i < xwin)
01154         {
01155           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
01156             {
01157               temp = abs(*limpp- ftzero); 
01158               *intp = temp;
01159               acc += *intp++ - *intpp++;
01160               *accpp++ += acc;
01161             }
01162         }
01163       else
01164         {
01165           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
01166             {
01167               temp = abs(*limpp-ftzero); 
01168               *intp = temp - abs(*limpp2-ftzero);
01169               acc += *intp++ - *intpp++;
01170               *accpp++ += acc;
01171             }
01172           limp2++;
01173         }
01174 #endif
01175           
01176       // disparity extraction, find min of correlations
01177       if (i >= xwin)            // far enough along...
01178         {
01179           disppp = dispp;
01180           accp   = accbuf + (ywin-1)*dlen; // results within initial corr window are partial
01181           textpp = textbuf + (ywin-1); // texture measure
01182 
01183           // start the minimum calc
01184           accpp = accp;
01185           nexv = val_epi16_7fff;
01186           for (d=0; d<dlen; d+=8, accpp+=8)
01187             nexv = _mm_min_epi16(nexv,*(__m128i *)accpp); // get window sum values
01188 
01189           // iterate over rows
01190           for (j=0; j<yim-ywin-YKERN+2; j+=8) 
01191             {
01192               //              if (*textpp < tfilter_thresh && tfilter_thresh > 0)
01193               //                {
01194               //                  *disppp = FILTEREDVAL;
01195               //                  continue;
01196               //                }
01197 
01198               // 8 rows at a time
01199               for (k=0; k<8; k++, textpp++) // do 8 values at a time
01200                 {
01201                   // shift all values left 2 bytes for next entry
01202                   cval = _mm_slli_si128(cval,2);
01203                   pval = _mm_slli_si128(pval,2);
01204                   nval = _mm_slli_si128(nval,2);
01205                   uval = _mm_slli_si128(uval,2);
01206                   ival = _mm_slli_si128(ival,2);
01207 
01208                   // propagate minimum value
01209                   // could use _mm_minpos_epu16 if available, still need two shuffles
01210                   temv = _mm_shufflelo_epi16(nexv,0xb1); // shuffle words
01211                   temv = _mm_shufflehi_epi16(temv,0xb1); // shuffle words
01212                   minv = _mm_min_epi16(nexv,temv); // word mins propagated
01213                   minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0xb1)); // shuffle dwords
01214                   minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0x4e)); // shuffle dwords
01215                   // save center value
01216                   cval = _mm_or_si128(_mm_and_si128(cval,p0xfffffff0),_mm_and_si128(minv,p0x0000000f)); 
01217                   // uniqueness threshold
01218                   uniqth = _mm_mulhi_epu16(uniqinit,minv);
01219                   uniqth = _mm_add_epi16(uniqth,minv);
01220 
01221                   // find index of min, and uniqueness count
01222                   // also do next min
01223                   indv = indtop;
01224                   indd = val_epi16_8;
01225                   uniqcnt = zeros;
01226                   nexv = val_epi16_7fff; // initial min value for next 8 rows
01227                   for (d=0; d<dlen; d+=8, accp+=8)
01228                     {
01229                       indm = _mm_cmpgt_epi16(*(__m128i *)accp,minv); // compare to min
01230                       indv = _mm_subs_epu16(indv,indd); // decrement indices
01231                       indd = _mm_and_si128(indd,indm); // wipe out decrement at min
01232                       unim = _mm_cmpgt_epi16(uniqth,*(__m128i *)accp); // compare to unique thresh                    
01233                       nexv = _mm_min_epi16(nexv,*(__m128i *)(accp+dlen)); // get min for next 8 rows
01234                       uniqcnt = _mm_sub_epi16(uniqcnt,unim); // add in uniq count
01235                     } 
01236                   indv = _mm_subs_epu16(indv,indd); // decrement indices to saturate at 0
01237                   // propagate max value
01238                   temv = _mm_shufflelo_epi16(indv,0xb1); // shuffle words
01239                   temv = _mm_shufflehi_epi16(temv,0xb1); // shuffle words
01240                   indv = _mm_max_epi16(indv,temv); // word maxs propagated
01241                   indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0xb1)); // shuffle dwords
01242                   indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0x4e)); // shuffle dwords
01243 
01244                   // set up subpixel interpolation (center, previous, next correlation sums)
01245                   dval = _mm_extract_epi16(indv,0);     // index of minimum
01246                   nval = _mm_insert_epi16(nval,*(accp-dval),0);
01247                   pval = _mm_insert_epi16(pval,*(accp-dval-2),0);
01248                   // save disparity
01249                   ival = _mm_or_si128(_mm_and_si128(ival,p0xfffffff0),_mm_and_si128(indv,p0x0000000f)); 
01250 
01251                   // finish up uniqueness count
01252                   uniqcnt = _mm_sad_epu8(uniqcnt,zeros); // add up each half
01253                   uniqcnt = _mm_add_epi32(uniqcnt, _mm_srli_si128(uniqcnt,8)); // add two halves
01254 #ifdef EXACTUNIQ
01255                   unim = _mm_cmpgt_epi16(uniqth,nval); // compare to unique thresh
01256                   uniqcnt = _mm_add_epi16(uniqcnt,unim); // subtract from uniq count
01257                   unim = _mm_cmpgt_epi16(uniqth,pval); // compare to unique thresh
01258                   uniqcnt = _mm_add_epi16(uniqcnt,unim); // subtract from uniq count
01259 #endif
01260                   if (*textpp < tfilter_thresh)
01261                     uniqcnt = val_epi16_8; // cancel out this value
01262                   uval = _mm_or_si128(_mm_and_si128(uval,p0xfffffff0),
01263                                       _mm_and_si128(uniqcnt,p0x0000000f));
01264                 }
01265 
01266               // disparity interpolation (to 1/16 pixel)
01267               // use 16*|p-n| / 2*(p+n-2c)  [p = previous, c = center, n = next]
01268               // sign determined by p-n, add increment if positive
01269               // denominator is always positive
01270               // division done by subtraction and comparison
01271               //   computes successive bits of the interpolation
01272               // NB: need to check disparities 0 and d-1, skip interpolation
01273               // addendum:
01274               //   for smoother interpolation, at |p-n| to numerator and denom
01275               //   approximates alpha^0.7
01276 
01277               // numerator
01278               numv = _mm_sub_epi16(pval,nval);
01279               sgnv = _mm_cmpgt_epi16(nval,pval); // sign, 0xffff for n>p
01280               numv = _mm_xor_si128(numv,sgnv);
01281               numv = _mm_sub_epi16(numv,sgnv);  // abs val here, |p-n|
01282               
01283               // denominator
01284               denv = _mm_add_epi16(pval,nval);
01285               denv = _mm_sub_epi16(denv,cval);
01286               denv = _mm_sub_epi16(denv,cval); // p+n-2c
01287 
01288               // addendum
01289               denv = _mm_add_epi16(denv,numv); // p+n-2c + |p-n|
01290               numv = _mm_add_epi16(numv,numv);  // 2*|p-n|
01291 
01292               // first bit is always zero unless c = p or c = n, but
01293               //   for this case we'll get all 1's in the code below, which will round up
01294               // multiply num by 2 for second bit
01295               numv = _mm_add_epi16(numv,numv);  
01296               // second bit
01297               temv = _mm_cmpgt_epi16(numv,denv); // 0xffff if n>d
01298               intv = _mm_srli_epi16(temv,15); // add 1 where n>d; use shift because subtraction kills gcc 4.1.x
01299               intv = _mm_slli_epi16(intv,1); // shift left
01300               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); // sub out denominator
01301               numv = _mm_slli_epi16(numv,1); // shift left (x2)
01302 
01303               // third bit
01304               temv = _mm_cmpgt_epi16(numv,denv); // 0xffff if n>d
01305               intv = _mm_sub_epi16(intv,temv); // add 1 where n>d
01306               intv = _mm_slli_epi16(intv,1); // shift left
01307               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); // sub out denominator
01308               numv = _mm_slli_epi16(numv,1); // shift left (x2)
01309 
01310               // fourth bit
01311               temv = _mm_cmpgt_epi16(numv,denv); // 0xffff if n>d
01312               intv = _mm_sub_epi16(intv,temv); // add 1 where n>d
01313               intv = _mm_slli_epi16(intv,1); // shift left
01314               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); // sub out denominator
01315               numv = _mm_slli_epi16(numv,1); // shift left (x2)
01316 
01317               // fifth bit
01318               temv = _mm_cmpgt_epi16(numv,denv); // 0xffff if n>d
01319               intv = _mm_sub_epi16(intv,temv); // add 1 where n>d
01320 
01321               // round and do sign
01322               intv = _mm_add_epi16(intv,val_epi16_1);   // add in 1 to round
01323               intv = _mm_srli_epi16(intv,1);    // shift back
01324               intv = _mm_xor_si128(intv,sgnv); // get sign back
01325               intv = _mm_sub_epi16(intv,sgnv); // signed val here
01326 
01327               // add in increment, should be max +-8
01328               ival = _mm_slli_epi16(ival,4);
01329               ival = _mm_sub_epi16(ival,intv);
01330               
01331               // check uniq count
01332               // null out pval and nval mins
01333               //              unim = _mm_cmpgt_epi16(uniqth,nval); // compare to unique thresh
01334               //              uval = _mm_add_epi16(uval,unim); // sub out
01335               //              unim = _mm_cmpgt_epi16(uniqth,pval); // compare to unique thresh
01336               //              uval = _mm_add_epi16(uval,unim); // sub out
01337 #ifdef EXACTUNIQ
01338               uval = _mm_cmplt_epi16(uval,val_epi16_2); // 0s where uniqcnt > 1
01339 #else
01340               uval = _mm_cmplt_epi16(uval,val_epi16_3); // 0s where uniqcnt > 2
01341 #endif
01342               ival = _mm_and_si128(uval,ival); // set to bad values to 0
01343 
01344               // store in output array
01345               *disppp = _mm_extract_epi16(ival,7);
01346               disppp += xim;
01347               *disppp = _mm_extract_epi16(ival,6);
01348               disppp += xim;
01349               *disppp = _mm_extract_epi16(ival,5);
01350               disppp += xim;
01351               *disppp = _mm_extract_epi16(ival,4);
01352               disppp += xim;
01353               *disppp = _mm_extract_epi16(ival,3);
01354               disppp += xim;
01355               *disppp = _mm_extract_epi16(ival,2);
01356               disppp += xim;
01357               *disppp = _mm_extract_epi16(ival,1);
01358               disppp += xim;
01359               *disppp = _mm_extract_epi16(ival,0);
01360               disppp += xim;
01361 
01362             } // end of row loop
01363 
01364           dispp++;              // go to next column of disparity output
01365         }
01366 
01367     } // end of outer column loop
01368 
01369   // clean up last few rows in case of overrun
01370   dispp = disp + (yim - YKERN/2 - ywin/2)*xim;
01371   for (i=0; i<YKERN; i++, dispp+=xim)
01372     memclr_si128((__m128i *)dispp, xim*sizeof(int16_t));
01373 
01374 }
01375 
01376 
01377 //
01378 // sparse stereo
01379 // returns disparity value in 1/16 pixel, -1 on failure
01380 // refpat is a 16x16 patch around the feature pixel
01381 //   feature pixel is at coordinate 7,7 within the patch
01382 // rim is the right gradient image
01383 // x,y is the feature pixel coordinate
01384 // other parameters as in do_stereo
01385 //
01386 
01387 int
01388 do_stereo_sparse(uint8_t *refpat, uint8_t *rim, // input feature images
01389           int x, int y,         // position of feature pixel
01390           int xim, int yim,     // size of images
01391           uint8_t ftzero,       // feature offset from zero
01392           int dlen,             // size of disparity search, multiple of 8
01393           int tfilter_thresh,   // texture filter threshold
01394           int ufilter_thresh    // uniqueness filter threshold, percent
01395           )
01396 {
01397   int i,j,k,sum,min,ind;
01398   uint8_t *rp, *rpp, *rppp, *pp, *ppp;
01399   int cbuf[1024];               // shouldn't have more disparities than this...
01400   int *cp;
01401   float c, p, n, v;
01402 
01403   if (x < dlen || y < 7 || y > (yim-8)) return -1;
01404 
01405   min = (int)1e6;
01406   ind = 0;
01407   rp = rim + (y-7)*xim + x - dlen + 1 - 7; // upper left corner of dlen-1 disparity block
01408   cp = cbuf;
01409   for (i=0; i<dlen; i++, rp++, cp++) // loop over disparities
01410     {
01411       rpp = rp;
01412       pp  = refpat;
01413       sum = 0;
01414       for (j=0; j<15; j++, rpp+=xim, pp+=16)    // loop over rows
01415         {
01416           rppp = rpp;
01417           ppp = pp;
01418           for (k=0; k<15; k++)  // loop over columns
01419             sum += abs(*ppp++ - *rppp++);
01420         }
01421       // store sum, keep track of min
01422       *cp = sum;
01423       if (sum < min)
01424         {
01425           min = sum;
01426           ind = i;
01427         }
01428     }
01429   // do interpolation
01430   if (ind==0 || ind==(dlen-1))
01431     return (dlen-ind-1)*16;
01432 
01433   c = (float)min;
01434   p = (float)cbuf[ind+1];
01435   n = (float)cbuf[ind-1];
01436   v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
01437   return (int)(0.5 + 16*v);
01438 }
01439 
01440 
01441 
01442 //
01443 // sparse stereo
01444 // returns disparity value in 1/16 pixel, -1 on failure
01445 // refpat is a 16x16 patch around the feature pixel
01446 //   feature pixel is at coordinate 7,7 within the patch
01447 // rim is the right gradient image
01448 // x,y is the feature pixel coordinate
01449 // other parameters as in do_stereo
01450 //
01451 
01452 int __attribute__ ((force_align_arg_pointer))
01453 do_stereo_sparse_fast(uint8_t *refpat, uint8_t *rim, // input feature images
01454       int x, int y,         // position of feature pixel
01455     int xim, int yim, // size of images
01456     uint8_t ftzero, // feature offset from zero
01457     int dlen,   // size of disparity search, multiple of 8
01458     int tfilter_thresh, // texture filter threshold
01459     int ufilter_thresh  // uniqueness filter threshold, percent
01460     )
01461 {
01462   int i,j,k,sum,min,ind;
01463   uint8_t *rp, *rpp, *rppp, *pp, *ppp;
01464   int cbuf[1024];   // shouldn't have more disparities than this...
01465   int *cp;
01466   float c, p, n, v;
01467 
01468   if (x < dlen || y < 7 || y > (yim-8))
01469     return -1;
01470 
01471   min = (int)1e6;
01472   ind = 0;
01473   rp = rim + (y-7)*xim + x - dlen + 1 - 7; // upper left corner of dlen-1 disparity block
01474   cp = cbuf;
01475   for (i=0; i<dlen; i++, rp++, cp++) { // loop over disparities
01476     rpp = rp;
01477     pp  = refpat;
01478 #ifdef __SSE2__
01479     __m128i a, b, diff, total;
01480     total = (__m128i)_mm_setzero_ps();
01481     unsigned int sums[4];
01482 
01483 #define ACCUM_SAD(offset) \
01484     a = (__m128i)_mm_loadu_ps((float*)(pp + (16 * offset))); \
01485     b = (__m128i)_mm_loadu_ps((float*)(rpp)); \
01486     rpp += xim; \
01487     diff = _mm_sad_epu8(a, b); \
01488     total = _mm_add_epi32(diff, total);
01489 
01490     ACCUM_SAD(0)
01491     ACCUM_SAD(1)
01492     ACCUM_SAD(2)
01493     ACCUM_SAD(3)
01494     ACCUM_SAD(4)
01495     ACCUM_SAD(5)
01496     ACCUM_SAD(6)
01497     ACCUM_SAD(7)
01498     ACCUM_SAD(8)
01499     ACCUM_SAD(9)
01500     ACCUM_SAD(10)
01501     ACCUM_SAD(11)
01502     ACCUM_SAD(12)
01503     ACCUM_SAD(13)
01504     ACCUM_SAD(14)
01505     ACCUM_SAD(15)
01506     _mm_storeu_ps((float*)sums, (__m128)total);
01507     sum = sums[0] + sums[2];
01508 #else
01509     sum = 0;
01510     for (j=0; j<15; j++, rpp+=xim, pp+=16) { // loop over rows
01511       rppp = rpp;
01512       ppp = pp;
01513       for (k=0; k<15; k++)  // loop over columns
01514         sum += abs(*ppp++ - *rppp++);
01515     }
01516 #endif
01517     // store sum, keep track of min
01518     *cp = sum;
01519     if (sum < min) {
01520       min = sum;
01521       ind = i;
01522     }
01523   }
01524 
01525   // do interpolation
01526   if (ind==0 || ind==(dlen-1))
01527     return (dlen-ind-1)*16;
01528 
01529   c = (float)min;
01530   p = (float)cbuf[ind+1];
01531   n = (float)cbuf[ind-1];
01532   v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
01533   return (int)(0.5 + 16*v);
01534 }
01535 
01536 
01537 
01538 //
01539 // speckle removal algorithm
01540 //
01541 // basic idea: find coherent regions of similar disparities
01542 // use 4-neighbors for checking disparity diffs
01543 // reject regions less than a certain size
01544 //
01545 // algorithm: 
01546 //   read through pixels in scan-line order
01547 //   for each pixel
01548 //     - if not a disparity, ignore
01549 //     - if labeled, check label status (good/bad) and assign disparity
01550 //     - if not labeled, assign next label and propagate
01551 //          * put on wavefront list
01552 //          * pop first wavefront member
01553 //              - if labeled, ignore
01554 //              - if no disparity, ignore
01555 //              - if not within diff, ignore
01556 //              - label, increment count, and push 4 neighbors
01557 //              - repeat until wavefront empty
01558 //
01559 //
01560 //
01561 // args:
01562 //   disp:  disparity image (16b)
01563 //   badval: value of bad disparity, usually 0
01564 //   width, height: size of image
01565 //   rdiff: max diff between pixels in a region
01566 //   rcount: min count for non-speckle region, in pixels
01567 //
01568 // buffers:
01569 //   labs:  holds pixel labels (32b), size is image
01570 //   wbuf:  holds propagating wavefront (32b), size is image
01571 //   rtype: holds region types (1=bad, 0=good)
01572 //
01573 
01574 
01575 #define push(x,y) { if ((disp[x] != badval) && (!labels[x])         \
01576                         && (disp[x]-y < rdiff) && (disp[x]-y > -rdiff)) \
01577     { labels[x] = cur; *ws++ = x; }}
01578 
01579 void
01580 do_speckle(int16_t *disp, int16_t badval, int width, int height, 
01581            int rdiff, int rcount, 
01582            uint32_t *labels, uint32_t *wbuf, uint8_t *rtype)
01583 {
01584   int i,j,k,cnt;
01585   uint32_t *ws, *ls;
01586   uint32_t p, cur;
01587   int16_t dp, *ds;
01588 
01589 //  printf("diff: %d  count: %d\n", rdiff, rcount);
01590 //  printf("width: %d  height: %d  badval: %d\n", width, height, badval);
01591 
01592   // clear out label assignments
01593   memset(labels,0x0,width*height*sizeof(uint32_t));
01594 
01595   // loop over rows, omit borders (TDB: use dtop etc here)
01596   const uint32_t MIN_P = 4*width+4;
01597   const uint32_t MAX_P = (height-4)*width-4;
01598   cur = 0;                      // current label
01599   for (i=4; i<height-4; i++)
01600     {
01601       k = i*width+4;            // index of pixel
01602       ds = disp+k;              // buffer ptrs
01603       ls = labels+k;
01604       for (j=4; j<width-4; j++, k++, ls++, ds++)
01605         {
01606           if (*ds != badval)    // not a bad disparity
01607             {
01608               if (*ls)          // has a label, check for bad label
01609                 {  
01610                   if (rtype[*ls]) // small region, zero out disparity
01611                     *ds = badval;
01612                 }
01613 
01614               // no label, assign and propagate
01615               else
01616                 {
01617                   ws = wbuf;    // initialize wavefront
01618                   p = k;        // current pixel
01619                   cur++;        // next label
01620                   cnt = 0;      // current region count
01621                   labels[p] = cur; 
01622 
01623                   // wavefront propagation
01624                   while (ws >= wbuf) // wavefront not empty
01625                     {
01626                       cnt++;
01627                       // put neighbors onto wavefront
01628                       dp = disp[p];
01630                       if (p+1 < MAX_P)      push(p+1,dp);
01631                       if (p-1 >= MIN_P)     push(p-1,dp);
01632                       if (p-width >= MIN_P) push(p-width,dp);
01633                       if (p+width < MAX_P)  push(p+width,dp);
01634 
01635                       // pop most recent and propagate
01636                       // NB: could try least recent, maybe better convergence
01637                       p = *--ws;
01638                     }
01639 
01640                   // assign label type
01641                   if (cnt < rcount)     // speckle region
01642                     {
01643                       rtype[*ls] = 1;   // small region label
01644                       *ds = badval;
01645                     }
01646                   else
01647                     {
01648                       rtype[*ls] = 0;   // large region label
01649 //                    printf("Count: %d  label: %d\n", cnt, k);
01650                     }
01651                   
01652                 }
01653             }
01654 
01655         }
01656     }
01657 }
01658 
01659 
01660 
01661 //
01662 // rectification algorithm
01663 // does bilinear interpolation using fixed lookup table
01664 //
01665 
01666 void 
01667 do_rectify_mono(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01668 {
01669   int i,j;
01670   uint8_t *p;
01671   int val;
01672   // loop over rows
01673   for (i=0; i<h; i++)
01674     {
01675       // loop over cols
01676       for (j=0; j<w; j++, dest++, rtab++)
01677         {
01678           p = src + rtab->addr; // upper left pixel
01679           val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01680           *dest = val >> INTOFFSET; // get rid of fractional offset
01681         }
01682     }
01683 }
01684 
01685 // SSE version, TBD
01686 void 
01687 do_rectify_mono_fast(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01688 {
01689   int i,j;
01690   uint8_t *p;
01691   int val;
01692   // loop over rows
01693   for (i=0; i<h; i++)
01694     {
01695       // loop over cols
01696       for (j=0; j<w; j++, dest++, rtab++)
01697         {
01698           p = src + rtab->addr; // upper left pixel
01699           val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01700           *dest = val >> INTOFFSET; // get rid of fractional offset
01701         }
01702     }
01703 }