00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 
00038 
00039 
00040 #include <frame_common/stereolib.h>
00041 #include <emmintrin.h>
00042 #include <stdio.h>
00043 
00044 
00045 
00046 
00047 
00048 
00049 static inline void
00050 memclr(void *buf, int n)
00051 {
00052   int i;
00053   unsigned char *bb = (unsigned char *)buf;
00054   for (i=0; i<n; i++)
00055     *bb++ = 0;
00056 }
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
00067 
00068 
00069 void
00070 do_prefilter_norm(uint8_t *im,  
00071           uint8_t *ftim,        
00072           int xim, int yim,     
00073           uint8_t ftzero,       
00074           uint8_t *buf          
00075           )
00076 {
00077   int i,j;
00078   uint8_t *imp, *impp;
00079   uint16_t acc;
00080   uint8_t *topp, *toppp, *botp, *botpp, *cenp;
00081   int32_t qval;
00082   uint32_t uqval;
00083   uint8_t *ftimp;
00084 
00085   
00086   uint16_t *accbuf, *accp;
00087   int ACCBUFSIZE = xim+64;
00088   accbuf = (uint16_t *)buf;
00089 
00090   
00091   imp = im;                     
00092   impp = im;                    
00093 
00094   
00095   memclr(accbuf, ACCBUFSIZE*sizeof(int16_t));
00096 
00097   
00098   for (j=0; j<yim; j++, imp+=xim, ftim+=xim)
00099     {
00100       acc = 0;                  
00101       accp = accbuf;            
00102       topp  = imp;              
00103       toppp = imp;              
00104       botp  = impp;             
00105       botpp = impp;             
00106 
00107       if (j<YKERN)              
00108         {
00109           for (i=0; i<XKERN; i++) 
00110             acc += *topp++;
00111           for (; i<xim; i++) 
00112             {
00113               acc += *topp++ - *toppp++; 
00114               *accp++ += acc;   
00115             }
00116         }
00117       else                      
00118         {
00119           cenp = imp - (YKERN/2)*xim + XKERN/2 + 1; 
00120           ftimp = ftim - (YKERN/2)*xim + XKERN/2 + 1; 
00121           for (i=0; i<XKERN; i++) 
00122             acc += *topp++ - *botp++;
00123           for (; i<xim; i++, accp++, cenp++) 
00124             {
00125               acc += *topp++ - *toppp++; 
00126               acc -= *botp++ - *botpp++; 
00127               *accp += acc;     
00128 
00129               
00130               qval = 4*(*cenp) + *(cenp-1) + *(cenp+1) + *(cenp-xim) + *(cenp+xim);
00131 #if (XKERN==9)
00132               qval = (qval*10)+*cenp - *accp;   
00133               qval = qval>>4;   
00134 #endif
00135 #if (XKERN==7)
00136               qval = (qval*6)+*cenp - *accp;    
00137               qval = qval>>3;   
00138 #endif
00139 
00140               if (qval < -ftzero)
00141                 uqval = 0;
00142               else if (qval > ftzero)
00143                 uqval = ftzero+ftzero;
00144               else
00145                 uqval = qval + ftzero;
00146               *ftimp++ = (uint8_t)uqval;
00147             }
00148           impp += xim;          
00149         }
00150     }
00151 }
00152 
00153 
00154 
00155 
00156 
00157 
00158 
00159 
00160 
00161 
00162 
00163 
00164 void
00165 do_prefilter_xsobel(uint8_t *im, 
00166           uint8_t *ftim,        
00167           int xim, int yim,     
00168           uint8_t ftzero,       
00169           uint8_t *buf          
00170           )
00171 {
00172   int i,j;
00173   uint8_t *impp, *impc, *impn;
00174   uint8_t *ftimp;
00175   int v;
00176 
00177   
00178   ftim += xim + 1;              
00179 
00180   
00181   for (j=1; j<yim-1; j++, im+=xim, ftim+=xim)
00182     {
00183       impp = im;
00184       impc = im+xim;
00185       impn = im+xim+xim;
00186       ftimp = ftim;
00187       for (i=1; i<xim-1; i+=4, impp+=4, impc+=4, impn+=4) 
00188             {
00189               
00190               v = *(impc+2)*2 - *(impc)*2 + *(impp+2) -
00191                 *(impp) + *(impn+2) - *(impn);
00192               v = v >> 0;       
00193               if (v < -ftzero)
00194                 v = 0;
00195               else if (v > ftzero)
00196                 v = ftzero+ftzero;
00197               else
00198                 v = v + ftzero;
00199               *ftimp++ = (uint8_t)v;
00200 
00201               v = *(impc+3)*2 - *(impc+1)*2 + *(impp+3) -
00202                 *(impp+1) + *(impn+3) - *(impn+1);
00203               v = v >> 0;       
00204               if (v < -ftzero)
00205                 v = 0;
00206               else if (v > ftzero)
00207                 v = ftzero+ftzero;
00208               else
00209                 v = v + ftzero;
00210               *ftimp++ = (uint8_t)v;
00211 
00212               v = *(impc+4)*2 - *(impc+2)*2 + *(impp+4) -
00213                 *(impp+2) + *(impn+4) - *(impn+2);
00214               v = v >> 0;       
00215               if (v < -ftzero)
00216                 v = 0;
00217               else if (v > ftzero)
00218                 v = ftzero+ftzero;
00219               else
00220                 v = v + ftzero;
00221               *ftimp++ = (uint8_t)v;
00222 
00223               v = *(impc+5)*2 - *(impc+3)*2 + *(impp+5) -
00224                 *(impp+3) + *(impn+5) - *(impn+3);
00225               v = v >> 0;       
00226               if (v < -ftzero)
00227                 v = 0;
00228               else if (v > ftzero)
00229                 v = ftzero+ftzero;
00230               else
00231                 v = v + ftzero;
00232               *ftimp++ = (uint8_t)v;
00233             }
00234     }
00235 }
00236 
00237 
00238 
00239 
00240 
00241 
00242 
00243 
00244 
00245 
00246 
00247 void
00248 do_stereo_y(uint8_t *lim, uint8_t *rim, 
00249           int16_t *disp,        
00250           int16_t *text,        
00251           int xim, int yim,     
00252           uint8_t ftzero,       
00253           int xwin, int ywin,   
00254           int dlen,             
00255           int tfilter_thresh,   
00256           int ufilter_thresh,   
00257           uint8_t *buf          
00258           )
00259 
00260 {
00261   int i,j,d;                    
00262   int16_t *accp, *accpp, *accminp; 
00263   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; 
00264   int8_t *corrend, *corrp, *corrpp; 
00265   int16_t *intp, *intpp;        
00266   int16_t *textpp;              
00267   int16_t *dispp, *disppp;      
00268   int16_t *textp;               
00269   int16_t acc, newv;
00270   int8_t  newc, temp;
00271   int16_t umin, uminthresh;     
00272   int dval;                     
00273   
00274   
00275 #define YINTBUFSIZE (yim+64)
00276   int16_t *intbuf  = (int16_t *)buf;    
00277 #define YTEXTBUFSIZE (yim+64)
00278   int16_t *textbuf = (int16_t *)&buf[YINTBUFSIZE*sizeof(int16_t)];      
00279 #define YACCBUFSIZE (yim*dlen + 64)
00280   int16_t *accbuf  = (int16_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE)*sizeof(int16_t)]; 
00281   int8_t  *corrbuf = (int8_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE + YACCBUFSIZE)*sizeof(int16_t)]; 
00282 
00283   
00284   memclr(intbuf, yim*sizeof(int16_t));
00285   memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
00286   memclr(accbuf, dlen*yim*sizeof(int16_t));
00287   memclr(textbuf, yim*sizeof(int16_t));
00288 
00289   
00290   corrend = corrbuf + dlen*yim*xwin;
00291   corrp = corrbuf;
00292   accminp = 0;                  
00293 
00294   
00295   limp = (int8_t *)lim + dlen;
00296   limp2 = limp;
00297   rimp = (int8_t *)rim + dlen;
00298   
00299   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
00300   textp = NULL;
00301   if (text != NULL)             
00302     textp = text + dlen;
00303 
00304   
00305   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
00306   tfilter_thresh = tfilter_thresh / 100; 
00307   
00308 
00309   
00310   
00311   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++)
00312     {    
00313       accp = accbuf;
00314       if (corrp >= corrend) corrp = corrbuf;
00315           
00316       
00317       
00318       for (d=0; d<dlen; d++, accp+=yim, corrp+=yim)      
00319         {
00320           limpp = limp;
00321           rimpp = rimp-d;
00322           intp = intbuf+ywin-1;
00323           intpp = intbuf;
00324           accpp = accp;
00325           corrpp = corrp;
00326               
00327           
00328           
00329           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
00330             {
00331               newc = abs(*limpp - *rimpp); 
00332               newv = newc - *corrpp + *intp++; 
00333               *corrpp++ = newc; 
00334               *intp = newv;     
00335               *accpp++ += newv - *intpp++; 
00336             }
00337         } 
00338 
00339       
00340       
00341       limpp = limp;
00342       limpp2 = limp2;
00343       intp = intbuf+ywin-1;
00344       intpp = intbuf;
00345       accpp = textbuf;
00346       acc = 0;
00347           
00348       
00349       
00350       
00351       if (i < xwin)
00352         {
00353           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
00354             {
00355               temp = abs(*limpp- ftzero); 
00356               *intp = temp;
00357               acc += *intp++ - *intpp++;
00358               *accpp++ += acc;
00359             }
00360         }
00361       else
00362         {
00363           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
00364             {
00365               temp = abs(*limpp-ftzero); 
00366               *intp = temp - abs(*limpp2-ftzero);
00367               acc += *intp++ - *intpp++;
00368               *accpp++ += acc;
00369             }
00370           limp2++;
00371         }
00372           
00373       
00374       if (i >= xwin)            
00375         {
00376           disppp = dispp;
00377           accp   = accbuf + (ywin-1); 
00378           textpp = textbuf + (ywin-1); 
00379 
00380           
00381           for (j=0; j<yim-ywin-YKERN+2; j++, accp++, disppp+=xim, textpp++) 
00382             {
00383               umin = 32000;
00384               accpp = accp;
00385               dval = -1;
00386 
00387               if (*textpp < tfilter_thresh)
00388                 {
00389                   *disppp = FILTEREDVAL;
00390                   continue;
00391                 }
00392 
00393               
00394               for (d=0; d<dlen; d++, accpp+=yim)
00395                 {
00396                   if (*accpp < umin)
00397                     {
00398                       accminp = accpp;
00399                       umin = *accpp;
00400                       dval = d;
00401                     }
00402                 }  
00403 
00404               
00405               accpp = accp;           
00406               if (ufilter_thresh > 0)
00407                 {
00408                   uminthresh = umin + (umin * ufilter_thresh) / 100;
00409                   for (d=0; d<dlen; d++, accpp+=yim)
00410                     {
00411                       if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
00412                         {
00413                           dval = -1;
00414                           break;
00415                         }
00416                     }
00417                 }
00418               if (dval < 0)
00419                 *disppp = FILTEREDVAL;
00420               else
00421                 {
00422                   double c, p, n, v;
00423                   c = (double)*accminp;
00424                   p = (double)*(accminp-yim);
00425                   n = (double)*(accminp+yim);
00426                   v = (double)dval + (p-n)/(2*(p+n-2*c));
00427                   *disppp = (int)(0.5 + 16*v);
00428                 }
00429             } 
00430 
00431           dispp++;              
00432         }
00433 
00434     } 
00435 
00436 }
00437 
00438 
00439 
00440 
00441 
00442 void
00443 do_stereo_d(uint8_t *lim, uint8_t *rim, 
00444             int16_t *disp,      
00445             int16_t *text,      
00446             int xim, int yim,   
00447             uint8_t ftzero,     
00448             int xwin, int ywin, 
00449             int dlen,           
00450             int tfilter_thresh, 
00451             int ufilter_thresh, 
00452             uint8_t *buf        
00453             )
00454 
00455 {
00456   int i,j,d;                    
00457   int16_t *accp, *accpp, *accminp; 
00458   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; 
00459   int8_t *corrend, *corrp, *corrpp; 
00460   int16_t *intp, *intpp;        
00461   int16_t *textpp;              
00462   int16_t *dispp, *disppp;      
00463   int16_t *textp;               
00464   int16_t acc, newv;
00465   int8_t  temp, newc;
00466   int16_t umin, uminthresh;     
00467   int dval;                     
00468   
00469   int16_t *intbuf, *textbuf, *accbuf;
00470   int8_t *corrbuf;
00471 
00472   int pfilter_thresh = 2*ywin*xwin;
00473 
00474 #define INTBUFSIZE ((yim-YKERN+ywin)*dlen+64)
00475   intbuf  = (int16_t *)buf;     
00476 #define TEXTBUFSIZE (yim+64)
00477   textbuf = (int16_t *)&buf[INTBUFSIZE*sizeof(int16_t)];        
00478 #define ACCBUFSIZE (yim*dlen + 64)
00479   accbuf  = (int16_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE)*sizeof(int16_t)]; 
00480   corrbuf = (int8_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)]; 
00481 
00482   
00483   memclr(intbuf, dlen*yim*sizeof(int16_t));
00484   memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
00485   memclr(accbuf, dlen*yim*sizeof(int16_t));
00486   memclr(textbuf, yim*sizeof(int16_t));
00487 
00488   
00489   corrend = corrbuf + dlen*yim*xwin;
00490   corrp = corrbuf;
00491   accminp = 0;                  
00492 
00493   
00494   limp = (int8_t *)lim + dlen - 1;
00495   limp2 = limp;
00496   rimp = (int8_t *)rim;
00497   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
00498   textp = NULL;
00499   if (text != NULL)             
00500     textp = text + dlen;
00501 
00502   
00503   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
00504   tfilter_thresh = tfilter_thresh / 100; 
00505   
00506 
00507   
00508   
00509   
00510   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
00511     {    
00512       accp = accbuf;
00513       if (corrp >= corrend) corrp = corrbuf;
00514       limpp = limp;
00515       rimpp = rimp;
00516       corrpp = corrp;
00517       intp = intbuf+(ywin-1)*dlen; 
00518       intpp = intbuf;   
00519           
00520       
00521       for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
00522         {
00523           
00524           
00525           for (d=0; d<dlen; d++, intp++)
00526             {
00527               
00528               newc = abs(*limpp - rimpp[d]); 
00529               newv = newc - *corrpp + *intp; 
00530               *corrpp++ = newc; 
00531               *(intp+dlen) = newv; 
00532               *accp++ += newv - *intpp++; 
00533             }
00534         } 
00535 
00536       
00537       
00538       memclr(intbuf, ywin*sizeof(int16_t));
00539       limpp = limp;
00540       limpp2 = limp2;
00541       intp = intbuf+ywin-1;
00542       intpp = intbuf;
00543       accpp = textbuf;
00544       acc = 0;
00545           
00546       
00547       
00548       
00549       if (i < xwin)
00550         {
00551           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
00552             {
00553               temp = abs(*limpp- ftzero); 
00554               *intp = temp;
00555               acc += *intp++ - *intpp++;
00556               *accpp++ += acc;
00557             }
00558         }
00559       else
00560         {
00561           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
00562             {
00563               temp = abs(*limpp-ftzero); 
00564               *intp = temp - abs(*limpp2-ftzero);
00565               acc += *intp++ - *intpp++;
00566               *accpp++ += acc;
00567             }
00568           limp2++;
00569         }
00570           
00571       
00572       if (i >= xwin)            
00573         {
00574           disppp = dispp;
00575           accp   = accbuf + (ywin-1)*dlen; 
00576           textpp = textbuf + (ywin-1); 
00577 
00578           
00579           for (j=0; j<yim-ywin-YKERN+2; j++, accp+=dlen, disppp+=xim, textpp++) 
00580             {
00581               umin = 32000;
00582               accpp = accp;
00583               dval = -1;
00584 
00585               if (*textpp < tfilter_thresh)
00586                 {
00587                   *disppp = FILTEREDVAL;
00588                   continue;
00589                 }
00590 
00591               
00592               for (d=dlen-1; d>=0; d--, accpp++)
00593                 {
00594                   if (*accpp < umin)
00595                     {
00596                       accminp = accpp;
00597                       umin = *accpp;
00598                       dval = d;
00599                     }
00600                 }  
00601 
00602               
00603               accpp = accp;           
00604               if (ufilter_thresh > 0)
00605                 {
00606                   uminthresh = umin + (umin * ufilter_thresh) / 100;
00607                   for (d=dlen-1; d>=0; d--, accpp++)
00608                     {
00609                       if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
00610                         {
00611                           dval = -1;
00612                           break;
00613                         }
00614                     }
00615                 }
00616               if (dval < 0)
00617                 *disppp = FILTEREDVAL;
00618               else
00619                 {
00620                   double c, p, n, v;
00621                   c = (double)*accminp;
00622                   p = (double)*(accminp+1);
00623                   n = (double)*(accminp-1);
00624                 
00625                   
00626                   if( *(accminp+1) + *(accminp-1) - 2*(*accminp) < pfilter_thresh){     
00627                         *disppp = FILTEREDVAL;
00628                   }     
00629                   else
00630                   {
00631 #define POW 0.7
00632 #if 1
00633                     
00634                     v = (p-n)/(p+n-2*c);
00635                     if (v >= 0) v = pow(v,POW);
00636                     else v = -pow(-v,POW);
00637 #elif 0
00638                     
00639                     double k = 1.0;
00640                     v = (k+1)*(p-n)/((p+n-2*c)+k*abs(p-n));
00641 #elif 0
00642                     
00643                     v = (p-n)/(p+n-2*c);
00644 #else
00645                     v = (p-n)/(p+n-2*c);
00646                     v = 0;
00647 #endif
00648                     v = (double)dval + 0.5*v;
00649                     *disppp = (int)(0.5 + 16.0*v);
00650                   }
00651                 }
00652             } 
00653 
00654           dispp++;              
00655         }
00656 
00657     } 
00658 
00659 }
00660 
00661 
00662 
00663 
00664 
00665 
00666 
00667 static inline void
00668 memclr_si128(__m128i *buf, int n)
00669 {
00670   int i;
00671   __m128i zz;
00672   zz = _mm_setzero_si128();
00673   n = n>>4;                     
00674   for (i=0; i<n; i++, buf++)
00675     _mm_store_si128(buf,zz);
00676 }
00677 
00678 
00679 
00680 
00681 
00682 
00683 
00684 
00685 #define PXKERN 7
00686 #define PYKERN 7
00687 
00688 void
00689 do_prefilter_fast(uint8_t *im,  
00690           uint8_t *ftim,        
00691           int xim, int yim,     
00692           uint8_t ftzero,       
00693           uint8_t *buf          
00694           )
00695 {
00696   int i,j;
00697 
00698   
00699   uintptr_t bufp = (uintptr_t)buf;
00700   int16_t *accbuf, *accp;
00701   int16_t *intbuf, *intp;       
00702   uint8_t *imp, *impp, *ftimp;
00703   __m128i acc, accs, acc1, accn1, accn2, acct, pxs, opxs, zeros;
00704   __m128i const_ftzero, const_ftzero_x48, const_ftzero_x2;
00705 
00706   int FACCBUFSIZE = xim+64;
00707 
00708   if (bufp & 0xF)
00709     bufp = (bufp+15) & ~(uintptr_t)0xF;
00710   buf = (uint8_t *)bufp;
00711   accbuf = (int16_t *)buf;
00712 
00713   intbuf = (int16_t *)&buf[FACCBUFSIZE*sizeof(int16_t)];        
00714   bufp = (uintptr_t)intbuf;
00715   if (bufp & 0xF)
00716     bufp = (bufp+15) & ~(uintptr_t)0xF;
00717   intbuf = (int16_t *)bufp;  
00718 
00719   
00720   memclr_si128((__m128i *)accbuf, FACCBUFSIZE*sizeof(int16_t));
00721   memclr_si128((__m128i *)intbuf, 8*sizeof(uint16_t));
00722   impp = im;                    
00723 
00724   
00725   zeros = _mm_setzero_si128();
00726   const_ftzero     = _mm_set1_epi16(ftzero);
00727   const_ftzero_x48 = _mm_set1_epi16(ftzero*48);
00728   const_ftzero_x2  = _mm_set1_epi16(ftzero*2);
00729 
00730   
00731   for (j=0; j<yim; j++, im+=xim, ftim+=xim)
00732     {
00733       accp = accbuf;            
00734       intp = intbuf+8;
00735       acc = _mm_setzero_si128(); 
00736       imp = im;                 
00737       impp = im - PYKERN*xim;
00738 
00739       
00740       if (j<PYKERN)
00741         {
00742           for (i=0; i<xim; i+=16, imp+=16, intp+=16, accp+=16)
00743             {
00744               pxs = _mm_load_si128((__m128i *)imp); 
00745               accs = _mm_unpacklo_epi8(pxs,zeros); 
00746               acc = _mm_srli_si128(acc, 14); 
00747               accs = _mm_add_epi16(accs, acc); 
00748               
00749               acct = _mm_slli_si128(accs, 2); 
00750               accs = _mm_add_epi16(accs, acct); 
00751               acct = _mm_slli_si128(accs, 4); 
00752               accs = _mm_add_epi16(accs, acct); 
00753               acct = _mm_slli_si128(accs, 8); 
00754               acc1 = _mm_add_epi16(accs, acct); 
00755               _mm_store_si128((__m128i *)intp,acc1); 
00756               
00757               accs = _mm_unpackhi_epi8(pxs,zeros); 
00758               acc = _mm_srli_si128(acc1, 14); 
00759               accs = _mm_add_epi16(accs, acc); 
00760               
00761               acct = _mm_slli_si128(accs, 2); 
00762               accs = _mm_add_epi16(accs, acct); 
00763               acct = _mm_slli_si128(accs, 4); 
00764               accs = _mm_add_epi16(accs, acct); 
00765               acct = _mm_slli_si128(accs, 8); 
00766               acc  = _mm_add_epi16(accs, acct); 
00767               _mm_store_si128((__m128i *)(intp+8),acc); 
00768               
00769               
00770               acct = _mm_loadu_si128((__m128i *)(intp-7)); 
00771               acc1 = _mm_sub_epi16(acc1,acct);
00772               accs = _mm_load_si128((__m128i *)accp); 
00773               acc1 = _mm_add_epi16(acc1,accs);
00774               _mm_store_si128((__m128i *)accp,acc1); 
00775               
00776               acct = _mm_loadu_si128((__m128i *)(intp-7+8)); 
00777               acc1 = _mm_sub_epi16(acc,acct);
00778               accs = _mm_load_si128((__m128i *)(accp+8)); 
00779               acc1 = _mm_add_epi16(acc1,accs);
00780               _mm_store_si128((__m128i *)(accp+8),acc1); 
00781 
00782             }
00783         }
00784 
00785 
00786       else                      
00787         {
00788           for (i=0; i<xim; i+=16, imp+=16, impp+=16, intp+=16, accp+=16)
00789             {
00790               pxs = _mm_load_si128((__m128i *)imp); 
00791               opxs = _mm_load_si128((__m128i *)impp); 
00792               accs = _mm_unpacklo_epi8(pxs,zeros); 
00793               acc = _mm_srli_si128(acc, 14); 
00794               accs = _mm_add_epi16(accs, acc); 
00795               acct = _mm_unpacklo_epi8(opxs,zeros); 
00796               accs = _mm_sub_epi16(accs, acct); 
00797 
00798               
00799               acct = _mm_slli_si128(accs, 2); 
00800               accs = _mm_add_epi16(accs, acct); 
00801               acct = _mm_slli_si128(accs, 4); 
00802               accs = _mm_add_epi16(accs, acct); 
00803               acct = _mm_slli_si128(accs, 8); 
00804               acc1 = _mm_add_epi16(accs, acct); 
00805               _mm_store_si128((__m128i *)intp,acc1); 
00806 
00807               
00808               accs = _mm_unpackhi_epi8(pxs,zeros); 
00809               acc = _mm_srli_si128(acc1, 14); 
00810               accs = _mm_add_epi16(accs, acc); 
00811               acct = _mm_unpackhi_epi8(opxs,zeros); 
00812               accs = _mm_sub_epi16(accs, acct); 
00813 
00814               
00815               acct = _mm_slli_si128(accs, 2); 
00816               accs = _mm_add_epi16(accs, acct); 
00817               acct = _mm_slli_si128(accs, 4); 
00818               accs = _mm_add_epi16(accs, acct); 
00819               acct = _mm_slli_si128(accs, 8); 
00820               acc  = _mm_add_epi16(accs, acct); 
00821               _mm_store_si128((__m128i *)(intp+8),acc); 
00822               
00823               
00824               acct = _mm_loadu_si128((__m128i *)(intp-7)); 
00825               acc1 = _mm_sub_epi16(acc1,acct);
00826               accs = _mm_load_si128((__m128i *)accp); 
00827               acc1 = _mm_add_epi16(acc1,accs);
00828               _mm_store_si128((__m128i *)accp,acc1); 
00829               
00830               acct = _mm_loadu_si128((__m128i *)(intp-7+8)); 
00831               acc1 = _mm_sub_epi16(acc,acct);
00832               accs = _mm_load_si128((__m128i *)(accp+8)); 
00833               acc1 = _mm_add_epi16(acc1,accs);
00834               _mm_store_si128((__m128i *)(accp+8),acc1); 
00835             }
00836 
00837           
00838           accp = accbuf+6;      
00839           imp = im - (PYKERN/2)*xim + PXKERN/2;
00840           ftimp = ftim - (PYKERN/2)*xim + PXKERN/2;
00841           for (i=0; i<xim-8; i+=16, imp+=16, accp+=16, ftimp+=16)
00842             {
00843               
00844               pxs = _mm_loadu_si128((__m128i *)imp); 
00845               accn1 = _mm_unpacklo_epi8(pxs,zeros); 
00846               accn2  = _mm_unpackhi_epi8(pxs,zeros); 
00847               accs = _mm_slli_epi16(accn1,2); 
00848               acc  = _mm_slli_epi16(accn2,2); 
00849               pxs  = _mm_loadu_si128((__m128i *)(imp+1)); 
00850               acct = _mm_unpacklo_epi8(pxs,zeros); 
00851               acc1 = _mm_unpackhi_epi8(pxs,zeros); 
00852               accs = _mm_add_epi16(acct,accs);
00853               acc  = _mm_add_epi16(acc1,acc);
00854               pxs  = _mm_loadu_si128((__m128i *)(imp-1)); 
00855               acct = _mm_unpacklo_epi8(pxs,zeros); 
00856               acc1 = _mm_unpackhi_epi8(pxs,zeros); 
00857               accs = _mm_add_epi16(acct,accs);
00858               acc  = _mm_add_epi16(acc1,acc);
00859               pxs  = _mm_loadu_si128((__m128i *)(imp+xim)); 
00860               acct = _mm_unpacklo_epi8(pxs,zeros); 
00861               acc1 = _mm_unpackhi_epi8(pxs,zeros); 
00862               accs = _mm_add_epi16(acct,accs);
00863               acc  = _mm_add_epi16(acc1,acc);
00864               pxs  = _mm_loadu_si128((__m128i *)(imp-xim)); 
00865               acct = _mm_unpacklo_epi8(pxs,zeros); 
00866               acc1 = _mm_unpackhi_epi8(pxs,zeros); 
00867               accs = _mm_add_epi16(acct,accs);
00868               acc  = _mm_add_epi16(acc1,acc);
00869               
00870               
00871               acct = _mm_slli_epi16(accs,2); 
00872               accs = _mm_add_epi16(accs,accs); 
00873               accs = _mm_add_epi16(accs,acct); 
00874               accs = _mm_add_epi16(accs,accn1); 
00875               
00876               acct = _mm_loadu_si128((__m128i *)accp);
00877               accs = _mm_sub_epi16(accs,acct); 
00878               
00879               accs = _mm_srai_epi16(accs,3); 
00880               accs = _mm_adds_epi16(accs,const_ftzero); 
00881               
00882               accs = _mm_max_epi16(accs,zeros); 
00883               accs = _mm_min_epi16(accs,const_ftzero_x2);
00884 
00885               
00886               
00887               acct = _mm_slli_epi16(acc,2); 
00888               acc = _mm_add_epi16(acc,acc); 
00889               acc = _mm_add_epi16(acc,acct); 
00890               acc = _mm_add_epi16(acc,accn2); 
00891               
00892               acct = _mm_loadu_si128((__m128i *)(accp+8));
00893               acc1 = _mm_sub_epi16(acc,acct);
00894               
00895               acc1 = _mm_srai_epi16(acc1,3); 
00896               acc1 = _mm_adds_epi16(acc1,const_ftzero); 
00897               
00898               acc1 = _mm_max_epi16(acc1,zeros); 
00899               acc1 = _mm_min_epi16(acc1,const_ftzero_x2);
00900 
00901               
00902               accs = _mm_packus_epi16(accs,acc1);
00903               _mm_storeu_si128((__m128i *)ftimp,accs);
00904               
00905             }
00906 
00907         } 
00908       
00909     }
00910 
00911 }
00912 
00913 
00914 
00915 
00916 
00917 
00918 #define EXACTUNIQ   // set this to do exact uniqueness values
00919 
00920 void
00921 do_stereo_d_fast(uint8_t *lim, uint8_t *rim, 
00922             int16_t *disp,      
00923             int16_t *text,      
00924             int xim, int yim,   
00925             uint8_t ftzero,     
00926             int xwin, int ywin, 
00927             int dlen,           
00928             int tfilter_thresh, 
00929             int ufilter_thresh, 
00930             uint8_t *buf        
00931             )
00932 {
00933   int i,j,k,d;                  
00934   int16_t *accp, *accpp;        
00935   int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2; 
00936   int8_t *corrend, *corrp, *corrpp; 
00937   int16_t *intp, *intpp;        
00938   int16_t *textpp;              
00939   int16_t *dispp, *disppp;      
00940   int16_t *textp;               
00941   int16_t acc;
00942   int dval;                     
00943   int uniqthresh;               
00944   
00945   
00946   __m128i limpix, rimpix, newpix, tempix, oldpix;
00947   __m128i newval, temval;
00948   __m128i minv, indv, indtop, indd, indm, temv, nexv;
00949   __m128i cval, pval, nval, ival; 
00950   __m128i denv, numv, intv, sgnv; 
00951   __m128i uniqcnt, uniqth, uniqinit, unim, uval; 
00952 
00953   
00954 #ifdef WIN32
00955   const __m128i p0xfffffff0 = {0x0, 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
00956                                0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
00957   const __m128i p0x0000000f = {0xff, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
00958                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
00959   const __m128i zeros = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
00960                          0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
00961   const __m128i val_epi16_7fff = {0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 
00962                                   0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f};
00963   const __m128i val_epi16_8    = {0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 
00964                                   0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0};
00965   const __m128i val_epi16_1    = {0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
00966                                   0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00};
00967   const __m128i val_epi16_2    = {0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00,
00968                                   0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00};
00969   const __m128i val_epi16_3    = {0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00,
00970                                   0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00};
00971 #else
00972   const __m128i p0xfffffff0 = {0xffffffffffff0000LL, 0xffffffffffffffffLL};
00973   const __m128i p0x0000000f = {0xffffLL, 0x0LL};
00974   const __m128i zeros = {0x0LL, 0x0LL};
00975   const __m128i val_epi16_7fff = {0x7fff7fff7fff7fffLL, 0x7fff7fff7fff7fffLL};
00976   const __m128i val_epi16_8    = {0x0008000800080008LL, 0x0008000800080008LL};
00977   const __m128i val_epi16_1    = {0x0001000100010001LL, 0x0001000100010001LL};
00978   const __m128i val_epi16_2    = {0x0002000200020002LL, 0x0002000200020002LL};
00979   const __m128i val_epi16_3    = {0x0003000300030003LL, 0x0003000300030003LL};
00980 #endif
00981 
00982   
00983   uintptr_t bufp = (uintptr_t)buf;
00984   int16_t *intbuf, *textbuf, *accbuf, temp;
00985   int8_t *corrbuf;
00986 
00987   if (bufp & 0xF)
00988     bufp = (bufp+15) & ~(uintptr_t)0xF;
00989   buf = (uint8_t *)bufp;
00990 
00991 #define INTEBUFSIZE ((yim+16+ywin)*dlen)
00992   intbuf  = (int16_t *)buf;     
00993   bufp = (uintptr_t)intbuf;
00994   if (bufp & 0xF)
00995     bufp = (bufp+15) & ~(uintptr_t)0xF;
00996   intbuf = (int16_t *)bufp;  
00997 
00998 #define TXTBUFSIZE (yim + 64)
00999   textbuf = (int16_t *)&buf[INTEBUFSIZE*sizeof(int16_t)];       
01000   bufp = (uintptr_t)textbuf;
01001   if (bufp & 0xF)
01002     bufp = (bufp+15) & ~(uintptr_t)0xF;
01003   textbuf = (int16_t *)bufp;  
01004 
01005 
01006 #define ACCBUFSIZE (yim*dlen + 64)
01007   accbuf  = (int16_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE)*sizeof(int16_t)]; 
01008   bufp = (uintptr_t)accbuf;
01009   if (bufp & 0xF)
01010     bufp = (bufp+15) & ~(uintptr_t)0xF;
01011   accbuf = (int16_t *)bufp;  
01012 
01013 
01014   corrbuf = (int8_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)]; 
01015   bufp = (uintptr_t)corrbuf;
01016   if (bufp & 0xF)
01017     bufp = (bufp+15) & ~(uintptr_t)0xF;
01018   corrbuf = (int8_t *)bufp;  
01019 
01020   
01021   memclr_si128((__m128i *)intbuf, dlen*yim*sizeof(int16_t));
01022   memclr_si128((__m128i *)corrbuf, dlen*yim*xwin*sizeof(int8_t));
01023   memclr_si128((__m128i *)accbuf, dlen*yim*sizeof(int16_t));
01024   memclr_si128((__m128i *)textbuf, yim*sizeof(int16_t));
01025 
01026   
01027   corrend = corrbuf + dlen*yim*xwin;
01028   corrp = corrbuf;
01029 
01030   
01031   limp = (int8_t *)lim + dlen - 1;
01032   limp2 = limp;
01033   rimp = (int8_t *)rim;
01034   dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2; 
01035   textp = NULL;
01036   if (text != NULL)             
01037     textp = text + dlen;
01038 
01039   
01040   tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
01041   tfilter_thresh = tfilter_thresh / 100; 
01042   
01043 
01044   
01045   
01046   indtop = _mm_set_epi16(dlen+0, dlen+1, dlen+2, dlen+3, dlen+4, dlen+5, dlen+6, dlen+7);
01047   uniqthresh = (0x8000 * ufilter_thresh)/100; 
01048                                               
01049   uniqinit = _mm_set1_epi16(uniqthresh);
01050   
01051   intv = zeros;
01052   ival = zeros;
01053   nval = zeros;
01054   pval = zeros;
01055   cval = zeros;
01056   uval = zeros;
01057 
01058   
01059   
01060   
01061   for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
01062     {    
01063       accp = accbuf;
01064       if (corrp >= corrend) corrp = corrbuf;
01065       limpp = limp;
01066       rimpp = rimp;
01067       corrpp = corrp;
01068       intp = intbuf+(ywin-1)*dlen; 
01069       intpp = intbuf;   
01070           
01071       
01072       for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim-dlen)
01073         {
01074           
01075           
01076           limpix = _mm_set1_epi8(*limpp);
01077 
01078           
01079           
01080           for (d=0; d<dlen; d+=16, intp+=16, intpp+=16, accp+=16, corrpp+=16, rimpp+=16)    
01081             {
01082               
01083               
01084               rimpix = _mm_loadu_si128((__m128i *)rimpp);
01085               newpix = _mm_subs_epu8(limpix,rimpix); 
01086               tempix = _mm_subs_epu8(rimpix,limpix); 
01087               newpix = _mm_add_epi8(newpix,tempix); 
01088 
01089 
01090 
01091               
01092               
01093               oldpix = _mm_load_si128((__m128i *)corrpp);             
01094               newval = _mm_unpacklo_epi8(newpix,zeros); 
01095               temval = _mm_load_si128((__m128i *)intp); 
01096               newval = _mm_add_epi16(newval,temval);
01097               temval = _mm_unpacklo_epi8(oldpix,zeros); 
01098               newval = _mm_sub_epi16(newval,temval); 
01099 
01100               
01101               
01102               _mm_store_si128((__m128i *)corrpp,newpix);
01103 
01104               
01105               
01106               _mm_store_si128((__m128i *)(intp+dlen),newval);         
01107 
01108               
01109               
01110               temval = _mm_load_si128((__m128i *)intpp); 
01111               newval = _mm_sub_epi16(newval,temval);
01112               temval = _mm_load_si128((__m128i *)accp); 
01113               newval = _mm_add_epi16(newval,temval);
01114               _mm_store_si128((__m128i *)accp,newval);
01115 
01116               
01117               
01118               
01119               newval = _mm_unpackhi_epi8(newpix,zeros); 
01120               temval = _mm_load_si128((__m128i *)(intp+8));     
01121               newval = _mm_add_epi16(newval,temval);
01122               temval = _mm_unpackhi_epi8(oldpix,zeros); 
01123               newval = _mm_sub_epi16(newval,temval); 
01124 
01125               
01126               
01127               _mm_store_si128((__m128i *)(intp+dlen+8),newval);       
01128 
01129               
01130               
01131               temval = _mm_load_si128((__m128i *)(intpp+8)); 
01132               newval = _mm_sub_epi16(newval,temval);
01133               temval = _mm_load_si128((__m128i *)(accp+8)); 
01134               newval = _mm_add_epi16(newval,temval);
01135               _mm_store_si128((__m128i *)(accp+8),newval);
01136             }
01137         } 
01138 
01139       
01140       
01141       memclr_si128((__m128i *)intbuf, yim*sizeof(int16_t));
01142       limpp = limp;
01143       limpp2 = limp2;
01144       intp = intbuf+ywin-1;
01145       intpp = intbuf;
01146       accpp = textbuf;
01147       acc = 0;
01148           
01149 #if 1                           // this should be optimized
01150       
01151       
01152       
01153       if (i < xwin)
01154         {
01155           for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
01156             {
01157               temp = abs(*limpp- ftzero); 
01158               *intp = temp;
01159               acc += *intp++ - *intpp++;
01160               *accpp++ += acc;
01161             }
01162         }
01163       else
01164         {
01165           for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
01166             {
01167               temp = abs(*limpp-ftzero); 
01168               *intp = temp - abs(*limpp2-ftzero);
01169               acc += *intp++ - *intpp++;
01170               *accpp++ += acc;
01171             }
01172           limp2++;
01173         }
01174 #endif
01175           
01176       
01177       if (i >= xwin)            
01178         {
01179           disppp = dispp;
01180           accp   = accbuf + (ywin-1)*dlen; 
01181           textpp = textbuf + (ywin-1); 
01182 
01183           
01184           accpp = accp;
01185           nexv = val_epi16_7fff;
01186           for (d=0; d<dlen; d+=8, accpp+=8)
01187             nexv = _mm_min_epi16(nexv,*(__m128i *)accpp); 
01188 
01189           
01190           for (j=0; j<yim-ywin-YKERN+2; j+=8) 
01191             {
01192               
01193               
01194               
01195               
01196               
01197 
01198               
01199               for (k=0; k<8; k++, textpp++) 
01200                 {
01201                   
01202                   cval = _mm_slli_si128(cval,2);
01203                   pval = _mm_slli_si128(pval,2);
01204                   nval = _mm_slli_si128(nval,2);
01205                   uval = _mm_slli_si128(uval,2);
01206                   ival = _mm_slli_si128(ival,2);
01207 
01208                   
01209                   
01210                   temv = _mm_shufflelo_epi16(nexv,0xb1); 
01211                   temv = _mm_shufflehi_epi16(temv,0xb1); 
01212                   minv = _mm_min_epi16(nexv,temv); 
01213                   minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0xb1)); 
01214                   minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0x4e)); 
01215                   
01216                   cval = _mm_or_si128(_mm_and_si128(cval,p0xfffffff0),_mm_and_si128(minv,p0x0000000f)); 
01217                   
01218                   uniqth = _mm_mulhi_epu16(uniqinit,minv);
01219                   uniqth = _mm_add_epi16(uniqth,minv);
01220 
01221                   
01222                   
01223                   indv = indtop;
01224                   indd = val_epi16_8;
01225                   uniqcnt = zeros;
01226                   nexv = val_epi16_7fff; 
01227                   for (d=0; d<dlen; d+=8, accp+=8)
01228                     {
01229                       indm = _mm_cmpgt_epi16(*(__m128i *)accp,minv); 
01230                       indv = _mm_subs_epu16(indv,indd); 
01231                       indd = _mm_and_si128(indd,indm); 
01232                       unim = _mm_cmpgt_epi16(uniqth,*(__m128i *)accp); 
01233                       nexv = _mm_min_epi16(nexv,*(__m128i *)(accp+dlen)); 
01234                       uniqcnt = _mm_sub_epi16(uniqcnt,unim); 
01235                     } 
01236                   indv = _mm_subs_epu16(indv,indd); 
01237                   
01238                   temv = _mm_shufflelo_epi16(indv,0xb1); 
01239                   temv = _mm_shufflehi_epi16(temv,0xb1); 
01240                   indv = _mm_max_epi16(indv,temv); 
01241                   indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0xb1)); 
01242                   indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0x4e)); 
01243 
01244                   
01245                   dval = _mm_extract_epi16(indv,0);     
01246                   nval = _mm_insert_epi16(nval,*(accp-dval),0);
01247                   pval = _mm_insert_epi16(pval,*(accp-dval-2),0);
01248                   
01249                   ival = _mm_or_si128(_mm_and_si128(ival,p0xfffffff0),_mm_and_si128(indv,p0x0000000f)); 
01250 
01251                   
01252                   uniqcnt = _mm_sad_epu8(uniqcnt,zeros); 
01253                   uniqcnt = _mm_add_epi32(uniqcnt, _mm_srli_si128(uniqcnt,8)); 
01254 #ifdef EXACTUNIQ
01255                   unim = _mm_cmpgt_epi16(uniqth,nval); 
01256                   uniqcnt = _mm_add_epi16(uniqcnt,unim); 
01257                   unim = _mm_cmpgt_epi16(uniqth,pval); 
01258                   uniqcnt = _mm_add_epi16(uniqcnt,unim); 
01259 #endif
01260                   if (*textpp < tfilter_thresh)
01261                     uniqcnt = val_epi16_8; 
01262                   uval = _mm_or_si128(_mm_and_si128(uval,p0xfffffff0),
01263                                       _mm_and_si128(uniqcnt,p0x0000000f));
01264                 }
01265 
01266               
01267               
01268               
01269               
01270               
01271               
01272               
01273               
01274               
01275               
01276 
01277               
01278               numv = _mm_sub_epi16(pval,nval);
01279               sgnv = _mm_cmpgt_epi16(nval,pval); 
01280               numv = _mm_xor_si128(numv,sgnv);
01281               numv = _mm_sub_epi16(numv,sgnv);  
01282               
01283               
01284               denv = _mm_add_epi16(pval,nval);
01285               denv = _mm_sub_epi16(denv,cval);
01286               denv = _mm_sub_epi16(denv,cval); 
01287 
01288               
01289               denv = _mm_add_epi16(denv,numv); 
01290               numv = _mm_add_epi16(numv,numv);  
01291 
01292               
01293               
01294               
01295               numv = _mm_add_epi16(numv,numv);  
01296               
01297               temv = _mm_cmpgt_epi16(numv,denv); 
01298               intv = _mm_srli_epi16(temv,15); 
01299               intv = _mm_slli_epi16(intv,1); 
01300               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); 
01301               numv = _mm_slli_epi16(numv,1); 
01302 
01303               
01304               temv = _mm_cmpgt_epi16(numv,denv); 
01305               intv = _mm_sub_epi16(intv,temv); 
01306               intv = _mm_slli_epi16(intv,1); 
01307               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); 
01308               numv = _mm_slli_epi16(numv,1); 
01309 
01310               
01311               temv = _mm_cmpgt_epi16(numv,denv); 
01312               intv = _mm_sub_epi16(intv,temv); 
01313               intv = _mm_slli_epi16(intv,1); 
01314               numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv)); 
01315               numv = _mm_slli_epi16(numv,1); 
01316 
01317               
01318               temv = _mm_cmpgt_epi16(numv,denv); 
01319               intv = _mm_sub_epi16(intv,temv); 
01320 
01321               
01322               intv = _mm_add_epi16(intv,val_epi16_1);   
01323               intv = _mm_srli_epi16(intv,1);    
01324               intv = _mm_xor_si128(intv,sgnv); 
01325               intv = _mm_sub_epi16(intv,sgnv); 
01326 
01327               
01328               ival = _mm_slli_epi16(ival,4);
01329               ival = _mm_sub_epi16(ival,intv);
01330               
01331               
01332               
01333               
01334               
01335               
01336               
01337 #ifdef EXACTUNIQ
01338               uval = _mm_cmplt_epi16(uval,val_epi16_2); 
01339 #else
01340               uval = _mm_cmplt_epi16(uval,val_epi16_3); 
01341 #endif
01342               ival = _mm_and_si128(uval,ival); 
01343 
01344               
01345               *disppp = _mm_extract_epi16(ival,7);
01346               disppp += xim;
01347               *disppp = _mm_extract_epi16(ival,6);
01348               disppp += xim;
01349               *disppp = _mm_extract_epi16(ival,5);
01350               disppp += xim;
01351               *disppp = _mm_extract_epi16(ival,4);
01352               disppp += xim;
01353               *disppp = _mm_extract_epi16(ival,3);
01354               disppp += xim;
01355               *disppp = _mm_extract_epi16(ival,2);
01356               disppp += xim;
01357               *disppp = _mm_extract_epi16(ival,1);
01358               disppp += xim;
01359               *disppp = _mm_extract_epi16(ival,0);
01360               disppp += xim;
01361 
01362             } 
01363 
01364           dispp++;              
01365         }
01366 
01367     } 
01368 
01369   
01370   dispp = disp + (yim - YKERN/2 - ywin/2)*xim;
01371   for (i=0; i<YKERN; i++, dispp+=xim)
01372     memclr_si128((__m128i *)dispp, xim*sizeof(int16_t));
01373 
01374 }
01375 
01376 
01377 
01378 
01379 
01380 
01381 
01382 
01383 
01384 
01385 
01386 
01387 int
01388 do_stereo_sparse(uint8_t *refpat, uint8_t *rim, 
01389           int x, int y,         
01390           int xim, int yim,     
01391           uint8_t ftzero,       
01392           int dlen,             
01393           int tfilter_thresh,   
01394           int ufilter_thresh    
01395           )
01396 {
01397   int i,j,k,sum,min,ind;
01398   uint8_t *rp, *rpp, *rppp, *pp, *ppp;
01399   int cbuf[1024];               
01400   int *cp;
01401   float c, p, n, v;
01402 
01403   if (x < dlen || y < 7 || y > (yim-8)) return -1;
01404 
01405   min = (int)1e6;
01406   ind = 0;
01407   rp = rim + (y-7)*xim + x - dlen + 1 - 7; 
01408   cp = cbuf;
01409   for (i=0; i<dlen; i++, rp++, cp++) 
01410     {
01411       rpp = rp;
01412       pp  = refpat;
01413       sum = 0;
01414       for (j=0; j<15; j++, rpp+=xim, pp+=16)    
01415         {
01416           rppp = rpp;
01417           ppp = pp;
01418           for (k=0; k<15; k++)  
01419             sum += abs(*ppp++ - *rppp++);
01420         }
01421       
01422       *cp = sum;
01423       if (sum < min)
01424         {
01425           min = sum;
01426           ind = i;
01427         }
01428     }
01429   
01430   if (ind==0 || ind==(dlen-1))
01431     return (dlen-ind-1)*16;
01432 
01433   c = (float)min;
01434   p = (float)cbuf[ind+1];
01435   n = (float)cbuf[ind-1];
01436   v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
01437   return (int)(0.5 + 16*v);
01438 }
01439 
01440 
01441 
01442 
01443 
01444 
01445 
01446 
01447 
01448 
01449 
01450 
01451 
01452 int __attribute__ ((force_align_arg_pointer))
01453 do_stereo_sparse_fast(uint8_t *refpat, uint8_t *rim, 
01454       int x, int y,         
01455     int xim, int yim, 
01456     uint8_t ftzero, 
01457     int dlen,   
01458     int tfilter_thresh, 
01459     int ufilter_thresh  
01460     )
01461 {
01462   int i,j,k,sum,min,ind;
01463   uint8_t *rp, *rpp, *rppp, *pp, *ppp;
01464   int cbuf[1024];   
01465   int *cp;
01466   float c, p, n, v;
01467 
01468   if (x < dlen || y < 7 || y > (yim-8))
01469     return -1;
01470 
01471   min = (int)1e6;
01472   ind = 0;
01473   rp = rim + (y-7)*xim + x - dlen + 1 - 7; 
01474   cp = cbuf;
01475   for (i=0; i<dlen; i++, rp++, cp++) { 
01476     rpp = rp;
01477     pp  = refpat;
01478 #ifdef __SSE2__
01479     __m128i a, b, diff, total;
01480     total = (__m128i)_mm_setzero_ps();
01481     unsigned int sums[4];
01482 
01483 #define ACCUM_SAD(offset) \
01484     a = (__m128i)_mm_loadu_ps((float*)(pp + (16 * offset))); \
01485     b = (__m128i)_mm_loadu_ps((float*)(rpp)); \
01486     rpp += xim; \
01487     diff = _mm_sad_epu8(a, b); \
01488     total = _mm_add_epi32(diff, total);
01489 
01490     ACCUM_SAD(0)
01491     ACCUM_SAD(1)
01492     ACCUM_SAD(2)
01493     ACCUM_SAD(3)
01494     ACCUM_SAD(4)
01495     ACCUM_SAD(5)
01496     ACCUM_SAD(6)
01497     ACCUM_SAD(7)
01498     ACCUM_SAD(8)
01499     ACCUM_SAD(9)
01500     ACCUM_SAD(10)
01501     ACCUM_SAD(11)
01502     ACCUM_SAD(12)
01503     ACCUM_SAD(13)
01504     ACCUM_SAD(14)
01505     ACCUM_SAD(15)
01506     _mm_storeu_ps((float*)sums, (__m128)total);
01507     sum = sums[0] + sums[2];
01508 #else
01509     sum = 0;
01510     for (j=0; j<15; j++, rpp+=xim, pp+=16) { 
01511       rppp = rpp;
01512       ppp = pp;
01513       for (k=0; k<15; k++)  
01514         sum += abs(*ppp++ - *rppp++);
01515     }
01516 #endif
01517     
01518     *cp = sum;
01519     if (sum < min) {
01520       min = sum;
01521       ind = i;
01522     }
01523   }
01524 
01525   
01526   if (ind==0 || ind==(dlen-1))
01527     return (dlen-ind-1)*16;
01528 
01529   c = (float)min;
01530   p = (float)cbuf[ind+1];
01531   n = (float)cbuf[ind-1];
01532   v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
01533   return (int)(0.5 + 16*v);
01534 }
01535 
01536 
01537 
01538 
01539 
01540 
01541 
01542 
01543 
01544 
01545 
01546 
01547 
01548 
01549 
01550 
01551 
01552 
01553 
01554 
01555 
01556 
01557 
01558 
01559 
01560 
01561 
01562 
01563 
01564 
01565 
01566 
01567 
01568 
01569 
01570 
01571 
01572 
01573 
01574 
01575 #define push(x,y) { if ((disp[x] != badval) && (!labels[x])         \
01576                         && (disp[x]-y < rdiff) && (disp[x]-y > -rdiff)) \
01577     { labels[x] = cur; *ws++ = x; }}
01578 
01579 void
01580 do_speckle(int16_t *disp, int16_t badval, int width, int height, 
01581            int rdiff, int rcount, 
01582            uint32_t *labels, uint32_t *wbuf, uint8_t *rtype)
01583 {
01584   int i,j,k,cnt;
01585   uint32_t *ws, *ls;
01586   uint32_t p, cur;
01587   int16_t dp, *ds;
01588 
01589 
01590 
01591 
01592   
01593   memset(labels,0x0,width*height*sizeof(uint32_t));
01594 
01595   
01596   const uint32_t MIN_P = 4*width+4;
01597   const uint32_t MAX_P = (height-4)*width-4;
01598   cur = 0;                      
01599   for (i=4; i<height-4; i++)
01600     {
01601       k = i*width+4;            
01602       ds = disp+k;              
01603       ls = labels+k;
01604       for (j=4; j<width-4; j++, k++, ls++, ds++)
01605         {
01606           if (*ds != badval)    
01607             {
01608               if (*ls)          
01609                 {  
01610                   if (rtype[*ls]) 
01611                     *ds = badval;
01612                 }
01613 
01614               
01615               else
01616                 {
01617                   ws = wbuf;    
01618                   p = k;        
01619                   cur++;        
01620                   cnt = 0;      
01621                   labels[p] = cur; 
01622 
01623                   
01624                   while (ws >= wbuf) 
01625                     {
01626                       cnt++;
01627                       
01628                       dp = disp[p];
01630                       if (p+1 < MAX_P)      push(p+1,dp);
01631                       if (p-1 >= MIN_P)     push(p-1,dp);
01632                       if (p-width >= MIN_P) push(p-width,dp);
01633                       if (p+width < MAX_P)  push(p+width,dp);
01634 
01635                       
01636                       
01637                       p = *--ws;
01638                     }
01639 
01640                   
01641                   if (cnt < rcount)     
01642                     {
01643                       rtype[*ls] = 1;   
01644                       *ds = badval;
01645                     }
01646                   else
01647                     {
01648                       rtype[*ls] = 0;   
01649 
01650                     }
01651                   
01652                 }
01653             }
01654 
01655         }
01656     }
01657 }
01658 
01659 
01660 
01661 
01662 
01663 
01664 
01665 
01666 void 
01667 do_rectify_mono(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01668 {
01669   int i,j;
01670   uint8_t *p;
01671   int val;
01672   
01673   for (i=0; i<h; i++)
01674     {
01675       
01676       for (j=0; j<w; j++, dest++, rtab++)
01677         {
01678           p = src + rtab->addr; 
01679           val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01680           *dest = val >> INTOFFSET; 
01681         }
01682     }
01683 }
01684 
01685 
01686 void 
01687 do_rectify_mono_fast(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01688 {
01689   int i,j;
01690   uint8_t *p;
01691   int val;
01692   
01693   for (i=0; i<h; i++)
01694     {
01695       
01696       for (j=0; j<w; j++, dest++, rtab++)
01697         {
01698           p = src + rtab->addr; 
01699           val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01700           *dest = val >> INTOFFSET; 
01701         }
01702     }
01703 }