00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #include <frame_common/stereolib.h>
00041 #include <emmintrin.h>
00042 #include <stdio.h>
00043
00044
00045
00046
00047
00048
/* Zero the first n bytes of buf.  Local replacement for memset(buf, 0, n)
   so the file has no libc dependency for scratch-buffer clearing. */
static inline void
memclr(void *buf, int n)
{
  unsigned char *p = (unsigned char *)buf;
  unsigned char *end = p + n;
  while (p < end)
    *p++ = 0;
}
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
/*
 * do_prefilter_norm - center-surround normalization prefilter (scalar).
 *
 * For each interior pixel, computes a center-weighted smoothed value
 * (4x the pixel plus its 4 direct neighbors, rescaled below) minus the
 * local XKERN x YKERN box sum, biases the result by ftzero, and clamps
 * it to [0, 2*ftzero] before storing into the feature image ftim.
 *
 * im      input image, xim x yim pixels
 * ftim    output feature image, same size; only interior pixels offset
 *         by the kernel radius are written
 * xim,yim image dimensions
 * ftzero  feature bias; output values lie in [0, 2*ftzero]
 * buf     caller scratch, at least (xim+64)*sizeof(uint16_t) bytes
 *
 * XKERN/YKERN come from frame_common/stereolib.h (not visible here);
 * the qval scaling below is specialized for XKERN of 7 or 9.
 */
void
do_prefilter_norm(uint8_t *im,
		  uint8_t *ftim,
		  int xim, int yim,
		  uint8_t ftzero,
		  uint8_t *buf
		  )
{
  int i,j;
  uint8_t *imp, *impp;
  uint16_t acc;			/* horizontal running sum; unsigned wraparound
				   keeps the incremental add/subtract exact mod 2^16 */
  uint8_t *topp, *toppp, *botp, *botpp, *cenp;
  int32_t qval;
  uint32_t uqval;
  uint8_t *ftimp;

  /* accbuf[i] holds the XKERN x YKERN box sum for column i (vertical
     accumulation of horizontal XKERN-sums over the last YKERN rows) */
  uint16_t *accbuf, *accp;
  int ACCBUFSIZE = xim+64;
  accbuf = (uint16_t *)buf;

  imp = im;			/* newest row entering the vertical window */
  impp = im;			/* oldest row, YKERN behind, to be subtracted */

  /* int16_t has the same size as the uint16_t elements, so the byte
     count is correct */
  memclr(accbuf, ACCBUFSIZE*sizeof(int16_t));

  for (j=0; j<yim; j++, imp+=xim, ftim+=xim)
    {
      acc = 0;
      accp = accbuf;
      topp = imp;		/* leading edge of horizontal window, new row */
      toppp = imp;		/* trailing edge, new row */
      botp = impp;		/* leading edge, old row */
      botpp = impp;		/* trailing edge, old row */

      if (j<YKERN)		/* window not yet full: accumulate only, no output */
	{
	  for (i=0; i<XKERN; i++)	/* prime the horizontal sum */
	    acc += *topp++;
	  for (; i<xim; i++)
	    {
	      acc += *topp++ - *toppp++;	/* slide the XKERN window right */
	      *accp++ += acc;			/* add this row's sums into columns */
	    }
	}
      else
	{
	  /* output pixel is centered on the box; cenp/ftimp point at the
	     center of the window whose box sum is being finalized */
	  cenp = imp - (YKERN/2)*xim + XKERN/2 + 1;
	  ftimp = ftim - (YKERN/2)*xim + XKERN/2 + 1;
	  for (i=0; i<XKERN; i++)	/* prime: add new row, drop old row */
	    acc += *topp++ - *botp++;
	  for (; i<xim; i++, accp++, cenp++)
	    {
	      acc += *topp++ - *toppp++;	/* slide window in new row */
	      acc -= *botp++ - *botpp++;	/* slide window in old row */
	      *accp += acc;			/* accbuf now has the full box sum */

	      /* center-weighted value: 4*center + 4-neighborhood */
	      qval = 4*(*cenp) + *(cenp-1) + *(cenp+1) + *(cenp-xim) + *(cenp+xim);
#if (XKERN==9)
	      /* scale so the smoothed center and the 9x9 box mean are comparable */
	      qval = (qval*10)+*cenp - *accp;
	      qval = qval>>4;
#endif
#if (XKERN==7)
	      /* scale for the 7x7 box: (6*qval + center - boxsum) / 8 */
	      qval = (qval*6)+*cenp - *accp;
	      qval = qval>>3;
#endif

	      /* bias by ftzero and clamp to [0, 2*ftzero] */
	      if (qval < -ftzero)
		uqval = 0;
	      else if (qval > ftzero)
		uqval = ftzero+ftzero;
	      else
		uqval = qval + ftzero;
	      *ftimp++ = (uint8_t)uqval;
	    }
	  impp += xim;		/* advance the trailing row only once the window is full */
	}
    }
}
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
/*
 * do_prefilter_xsobel - horizontal Sobel prefilter.
 *
 * Applies the x-Sobel kernel [-1 0 1; -2 0 2; -1 0 1] to every interior
 * pixel, biases the response by ftzero and clamps it to [0, 2*ftzero],
 * writing the result into ftim at the same (row+1, col+1) position.
 *
 * im      input image, xim x yim
 * ftim    output feature image, same size; border pixels are not written
 * xim,yim image dimensions
 * ftzero  feature bias; output values lie in [0, 2*ftzero]
 * buf     unused scratch (kept for interface compatibility with the
 *         other prefilter variants)
 *
 * The previous version had the kernel body copy-pasted four times (one
 * per unrolled pixel) and a dead `v = v >> 0` in each copy; this version
 * keeps the identical 4-pixel-per-iteration write pattern via an inner
 * loop.
 */
void
do_prefilter_xsobel(uint8_t *im,
		    uint8_t *ftim,
		    int xim, int yim,
		    uint8_t ftzero,
		    uint8_t *buf
		    )
{
  int i, j, k;
  uint8_t *impp, *impc, *impn;
  uint8_t *ftimp;
  int v;

  (void)buf;			/* not needed by this prefilter */

  ftim += xim + 1;		/* output starts at pixel (1,1) */

  for (j = 1; j < yim-1; j++, im += xim, ftim += xim)
    {
      impp = im;		/* row above */
      impc = im + xim;		/* center row */
      impn = im + xim + xim;	/* row below */
      ftimp = ftim;
      /* NOTE(review): each outer iteration emits 4 pixels even when fewer
	 than 4 remain before the right border, exactly as the original
	 unrolled loop did; rows must be padded accordingly by the caller. */
      for (i = 1; i < xim-1; i += 4, impp += 4, impc += 4, impn += 4)
	{
	  for (k = 0; k < 4; k++)
	    {
	      /* x-Sobel response at column i+k */
	      v = *(impc+k+2)*2 - *(impc+k)*2 + *(impp+k+2) -
		*(impp+k) + *(impn+k+2) - *(impn+k);
	      /* bias by ftzero and clamp to [0, 2*ftzero] */
	      if (v < -ftzero)
		v = 0;
	      else if (v > ftzero)
		v = ftzero + ftzero;
	      else
		v = v + ftzero;
	      *ftimp++ = (uint8_t)v;
	    }
	}
    }
}
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
/*
 * do_stereo_y - column-wise SAD stereo correlation (y-major accumulators).
 *
 * Sweeps the image column by column, maintaining for every disparity d a
 * vertical column of running SAD sums over an xwin x ywin window, using
 * incremental box filtering: corrbuf is a ring buffer of the last xwin
 * columns of per-pixel absolute differences, intbuf a sliding vertical
 * sum, accbuf the full window sums (layout accbuf[d*yim + y]).  Texture
 * and uniqueness filters reject weak matches; winners get parabolic
 * subpixel interpolation scaled by 16.
 *
 * lim,rim          prefiltered left/right images, xim x yim
 * disp             disparity output, 16 * (disparity + subpixel fraction),
 *                  FILTEREDVAL (from stereolib.h) where rejected
 * text             texture output (currently only offset, not written here)
 * ftzero           feature bias of the prefiltered images
 * xwin,ywin        correlation window size
 * dlen             number of disparities searched
 * tfilter_thresh   texture threshold, percent-style (rescaled below)
 * ufilter_thresh   uniqueness threshold, percent
 * buf              caller scratch sized for the four buffers carved below
 *
 * XKERN/YKERN/FILTEREDVAL come from frame_common/stereolib.h (not
 * visible in this chunk).
 */
void
do_stereo_y(uint8_t *lim, uint8_t *rim,
	    int16_t *disp,
	    int16_t *text,
	    int xim, int yim,
	    uint8_t ftzero,
	    int xwin, int ywin,
	    int dlen,
	    int tfilter_thresh,
	    int ufilter_thresh,
	    uint8_t *buf
	    )

{
  int i,j,d;
  int16_t *accp, *accpp, *accminp;
  int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2;
  int8_t *corrend, *corrp, *corrpp;
  int16_t *intp, *intpp;
  int16_t *textpp;
  int16_t *dispp, *disppp;
  int16_t *textp;
  int16_t acc, newv;
  int8_t newc, temp;
  int16_t umin, uminthresh;
  int dval;

  /* carve the scratch area into: vertical-sum buffer, texture buffer,
     window-sum buffer (dlen columns), and the xwin-column ring of raw
     absolute differences */
#define YINTBUFSIZE (yim+64)
  int16_t *intbuf = (int16_t *)buf;
#define YTEXTBUFSIZE (yim+64)
  int16_t *textbuf = (int16_t *)&buf[YINTBUFSIZE*sizeof(int16_t)];
#define YACCBUFSIZE (yim*dlen + 64)
  int16_t *accbuf = (int16_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE)*sizeof(int16_t)];
  int8_t *corrbuf = (int8_t *)&buf[(YINTBUFSIZE + YTEXTBUFSIZE + YACCBUFSIZE)*sizeof(int16_t)];

  memclr(intbuf, yim*sizeof(int16_t));
  memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
  memclr(accbuf, dlen*yim*sizeof(int16_t));
  memclr(textbuf, yim*sizeof(int16_t));

  corrend = corrbuf + dlen*yim*xwin;
  corrp = corrbuf;		/* current slot in the xwin-column ring */
  accminp = 0;

  /* start dlen columns in so every disparity has a valid right-image pixel */
  limp = (int8_t *)lim + dlen;
  limp2 = limp;			/* trails limp by xwin columns for the texture sum */
  rimp = (int8_t *)rim + dlen;

  /* first output pixel: centered on window + kernel radii */
  dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2;
  textp = NULL;
  if (text != NULL)
    textp = text + dlen;

  /* convert percent threshold to the same scale as the window sums */
  tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
  tfilter_thresh = tfilter_thresh / 100;

  for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++)
    {
      accp = accbuf;
      if (corrp >= corrend) corrp = corrbuf;	/* wrap the column ring */

      /* update the per-disparity column sums with this image column */
      for (d=0; d<dlen; d++, accp+=yim, corrp+=yim)
	{
	  limpp = limp;
	  rimpp = rimp-d;
	  intp = intbuf+ywin-1;		/* newest entry of the vertical window */
	  intpp = intbuf;		/* oldest entry, falls out of the window */
	  accpp = accp;
	  corrpp = corrp;		/* ring slot holds the column leaving the window */

	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
	    {
	      newc = abs(*limpp - *rimpp);	/* new pixel cost */
	      /* new vertical sum = old sum + new pixel - pixel xwin columns back */
	      newv = newc - *corrpp + *intp++;
	      *corrpp++ = newc;			/* remember cost for when it leaves */
	      *intp = newv;
	      *accpp++ += newv - *intpp++;	/* slide the ywin vertical window */
	    }
	}

      /* texture measure: same incremental scheme over |pixel - ftzero| */
      limpp = limp;
      limpp2 = limp2;
      intp = intbuf+ywin-1;
      intpp = intbuf;
      accpp = textbuf;
      acc = 0;

      if (i < xwin)			/* window not yet full: add only */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
	    {
	      temp = abs(*limpp- ftzero);
	      *intp = temp;
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	}
      else				/* full window: add new column, drop old */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
	    {
	      temp = abs(*limpp-ftzero);
	      *intp = temp - abs(*limpp2-ftzero);
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	  limp2++;
	}

      /* once xwin columns are in, search disparities for each row */
      if (i >= xwin)
	{
	  disppp = dispp;
	  accp = accbuf + (ywin-1);	/* first row with a complete window */
	  textpp = textbuf + (ywin-1);

	  for (j=0; j<yim-ywin-YKERN+2; j++, accp++, disppp+=xim, textpp++)
	    {
	      umin = 32000;
	      accpp = accp;
	      dval = -1;

	      /* texture filter: reject low-contrast windows */
	      if (*textpp < tfilter_thresh)
		{
		  *disppp = FILTEREDVAL;
		  continue;
		}

	      /* find minimum SAD over all disparities (stride yim per d) */
	      for (d=0; d<dlen; d++, accpp+=yim)
		{
		  if (*accpp < umin)
		    {
		      accminp = accpp;
		      umin = *accpp;
		      dval = d;
		    }
		}

	      /* uniqueness filter: any non-adjacent disparity within
		 ufilter_thresh percent of the minimum kills the match */
	      accpp = accp;
	      if (ufilter_thresh > 0)
		{
		  uminthresh = umin + (umin * ufilter_thresh) / 100;
		  for (d=0; d<dlen; d++, accpp+=yim)
		    {
		      if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
			{
			  dval = -1;
			  break;
			}
		    }
		}
	      if (dval < 0)
		*disppp = FILTEREDVAL;
	      else
		{
		  /* parabolic subpixel interpolation around the minimum;
		     result scaled by 16 (4 fractional bits) */
		  double c, p, n, v;
		  c = (double)*accminp;
		  p = (double)*(accminp-yim);	/* previous disparity */
		  n = (double)*(accminp+yim);	/* next disparity */
		  v = (double)dval + (p-n)/(2*(p+n-2*c));
		  *disppp = (int)(0.5 + 16*v);
		}
	    }

	  dispp++;
	}

    }

}
00437
00438
00439
00440
00441
/*
 * do_stereo_d - column-wise SAD stereo correlation (disparity-major
 * accumulators).
 *
 * Same algorithm as do_stereo_y, but the inner loop runs over disparities
 * for each row (layout accbuf[y*dlen + d], intbuf[(row)*dlen + d]), which
 * keeps all disparities of one pixel adjacent in memory.  Adds a peak-
 * curvature filter (pfilter_thresh) and a pow-shaped subpixel
 * interpolation in place of the plain parabola.
 *
 * Parameters as in do_stereo_y.  XKERN/YKERN/FILTEREDVAL come from
 * frame_common/stereolib.h (not visible in this chunk).
 */
void
do_stereo_d(uint8_t *lim, uint8_t *rim,
	    int16_t *disp,
	    int16_t *text,
	    int xim, int yim,
	    uint8_t ftzero,
	    int xwin, int ywin,
	    int dlen,
	    int tfilter_thresh,
	    int ufilter_thresh,
	    uint8_t *buf
	    )

{
  int i,j,d;
  int16_t *accp, *accpp, *accminp;
  int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2;
  int8_t *corrend, *corrp, *corrpp;
  int16_t *intp, *intpp;
  int16_t *textpp;
  int16_t *dispp, *disppp;
  int16_t *textp;
  int16_t acc, newv;
  int8_t temp, newc;
  int16_t umin, uminthresh;
  int dval;

  int16_t *intbuf, *textbuf, *accbuf;
  int8_t *corrbuf;

  /* minimum curvature of the SAD valley for an accepted peak */
  int pfilter_thresh = 2*ywin*xwin;

  /* carve scratch: vertical sums (dlen-wide rows), texture, window sums,
     and the xwin-column ring of raw costs */
#define INTBUFSIZE ((yim-YKERN+ywin)*dlen+64)
  intbuf = (int16_t *)buf;
#define TEXTBUFSIZE (yim+64)
  textbuf = (int16_t *)&buf[INTBUFSIZE*sizeof(int16_t)];
#define ACCBUFSIZE (yim*dlen + 64)
  accbuf = (int16_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE)*sizeof(int16_t)];
  corrbuf = (int8_t *)&buf[(INTBUFSIZE+TEXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)];

  memclr(intbuf, dlen*yim*sizeof(int16_t));
  memclr(corrbuf, dlen*yim*xwin*sizeof(int8_t));
  memclr(accbuf, dlen*yim*sizeof(int16_t));
  memclr(textbuf, yim*sizeof(int16_t));

  corrend = corrbuf + dlen*yim*xwin;
  corrp = corrbuf;		/* current column slot in the ring */
  accminp = 0;

  /* left image starts dlen-1 columns in; disparity d matches rimp[d] */
  limp = (int8_t *)lim + dlen - 1;
  limp2 = limp;			/* trails limp by xwin columns (texture sum) */
  rimp = (int8_t *)rim;
  dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2;
  textp = NULL;
  if (text != NULL)
    textp = text + dlen;

  /* convert percent threshold to window-sum scale */
  tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
  tfilter_thresh = tfilter_thresh / 100;

  for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
    {
      accp = accbuf;
      if (corrp >= corrend) corrp = corrbuf;	/* wrap the ring */
      limpp = limp;
      rimpp = rimp;
      corrpp = corrp;
      intp = intbuf+(ywin-1)*dlen;	/* newest row of vertical sums */
      intpp = intbuf;			/* oldest row, leaving the window */

      /* update the window sums, all disparities of each row together */
      for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim)
	{
	  for (d=0; d<dlen; d++, intp++)
	    {
	      newc = abs(*limpp - rimpp[d]);	/* new cost for disparity d */
	      /* new vertical sum = old + new cost - cost xwin columns back */
	      newv = newc - *corrpp + *intp;
	      *corrpp++ = newc;
	      *(intp+dlen) = newv;		/* next row's "previous" sum */
	      *accp++ += newv - *intpp++;	/* slide ywin vertical window */
	    }
	}

      /* texture measure over |pixel - ftzero|; reuses the head of intbuf */
      memclr(intbuf, ywin*sizeof(int16_t));
      limpp = limp;
      limpp2 = limp2;
      intp = intbuf+ywin-1;
      intpp = intbuf;
      accpp = textbuf;
      acc = 0;

      if (i < xwin)			/* window not full yet: add only */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
	    {
	      temp = abs(*limpp- ftzero);
	      *intp = temp;
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	}
      else				/* add new column, drop old column */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
	    {
	      temp = abs(*limpp-ftzero);
	      *intp = temp - abs(*limpp2-ftzero);
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	  limp2++;
	}

      /* disparity search once xwin columns have been accumulated */
      if (i >= xwin)
	{
	  disppp = dispp;
	  accp = accbuf + (ywin-1)*dlen;	/* first complete-window row */
	  textpp = textbuf + (ywin-1);

	  for (j=0; j<yim-ywin-YKERN+2; j++, accp+=dlen, disppp+=xim, textpp++)
	    {
	      umin = 32000;
	      accpp = accp;
	      dval = -1;

	      /* texture filter */
	      if (*textpp < tfilter_thresh)
		{
		  *disppp = FILTEREDVAL;
		  continue;
		}

	      /* find minimum SAD; buffer index runs opposite to disparity,
		 hence the descending d */
	      for (d=dlen-1; d>=0; d--, accpp++)
		{
		  if (*accpp < umin)
		    {
		      accminp = accpp;
		      umin = *accpp;
		      dval = d;
		    }
		}

	      /* uniqueness filter: near-minimum at a non-adjacent disparity
		 rejects the match */
	      accpp = accp;
	      if (ufilter_thresh > 0)
		{
		  uminthresh = umin + (umin * ufilter_thresh) / 100;
		  for (d=dlen-1; d>=0; d--, accpp++)
		    {
		      if (*accpp <= uminthresh && (d < dval-1 || d > dval+1))
			{
			  dval = -1;
			  break;
			}
		    }
		}
	      if (dval < 0)
		*disppp = FILTEREDVAL;
	      else
		{
		  double c, p, n, v;
		  c = (double)*accminp;
		  p = (double)*(accminp+1);	/* previous disparity (reversed layout) */
		  n = (double)*(accminp-1);	/* next disparity */

		  /* peak filter: reject shallow SAD valleys */
		  if( *(accminp+1) + *(accminp-1) - 2*(*accminp) < pfilter_thresh){
		    *disppp = FILTEREDVAL;
		  }
		  else
		    {
		      /* subpixel offset; the active branch applies a pow()
			 shaping (exponent POW) to reduce pixel-locking,
			 the disabled branches are alternative estimators */
#define POW 0.7
#if 1
		      v = (p-n)/(p+n-2*c);
		      if (v >= 0) v = pow(v,POW);
		      else v = -pow(-v,POW);
#elif 0
		      double k = 1.0;
		      v = (k+1)*(p-n)/((p+n-2*c)+k*abs(p-n));
#elif 0
		      v = (p-n)/(p+n-2*c);
#else
		      v = (p-n)/(p+n-2*c);
		      v = 0;
#endif
		      v = (double)dval + 0.5*v;
		      *disppp = (int)(0.5 + 16.0*v);	/* 4 fractional bits */
		    }
		}
	    }

	  dispp++;
	}

    }

}
00660
00661
00662
00663
00664
00665
00666
/* Zero n bytes (rounded down to a multiple of 16) starting at buf,
   using aligned SSE2 stores.  buf must be 16-byte aligned. */
static inline void
memclr_si128(__m128i *buf, int n)
{
  const __m128i zero = _mm_setzero_si128();
  __m128i *end = buf + (n >> 4);	/* n/16 full vectors */
  while (buf < end)
    _mm_store_si128(buf++, zero);
}
00677
00678
00679
00680
00681
00682
00683
00684
/* kernel size used by the SSE prefilter (7x7 box) */
#define PXKERN 7
#define PYKERN 7

/*
 * do_prefilter_fast - SSE2 version of the normalization prefilter,
 * specialized to a PXKERN x PYKERN (7x7) box.
 *
 * Per row, horizontal prefix sums of 8x16-bit lanes are built with
 * shift-and-add (_mm_slli_si128 chains) into intbuf; a 7-wide horizontal
 * sum is then prefix[i] - prefix[i-7], and accbuf accumulates those over
 * the last PYKERN rows (new row added, row PYKERN back subtracted).  The
 * second pass computes the center-weighted value (6*(4*center + 4
 * neighbors) + center, matching the scalar XKERN==7 path), subtracts the
 * box sum, biases by ftzero, clamps to [0, 2*ftzero] and stores 16
 * output pixels at a time.
 *
 * im      input image, xim x yim; xim is expected to be a multiple of 16
 *         for the aligned loads -- TODO confirm against callers
 * ftim    output feature image, interior only
 * ftzero  feature bias; output clamped to [0, 2*ftzero]
 * buf     caller scratch; realigned to 16 bytes below
 */
void
do_prefilter_fast(uint8_t *im,
		  uint8_t *ftim,
		  int xim, int yim,
		  uint8_t ftzero,
		  uint8_t *buf
		  )
{
  int i,j;

  /* 16-byte-align the scratch area for aligned SSE loads/stores */
  uintptr_t bufp = (uintptr_t)buf;
  int16_t *accbuf, *accp;
  int16_t *intbuf, *intp;
  uint8_t *imp, *impp, *ftimp;
  __m128i acc, accs, acc1, accn1, accn2, acct, pxs, opxs, zeros;
  __m128i const_ftzero, const_ftzero_x48, const_ftzero_x2;

  int FACCBUFSIZE = xim+64;

  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  buf = (uint8_t *)bufp;
  accbuf = (int16_t *)buf;		/* vertical box-sum accumulator */

  intbuf = (int16_t *)&buf[FACCBUFSIZE*sizeof(int16_t)];	/* row prefix sums */
  bufp = (uintptr_t)intbuf;
  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  intbuf = (int16_t *)bufp;

  memclr_si128((__m128i *)accbuf, FACCBUFSIZE*sizeof(int16_t));
  memclr_si128((__m128i *)intbuf, 8*sizeof(uint16_t));	/* zero lead-in lanes */
  impp = im;

  /* constants for bias and clamping; const_ftzero_x48 appears unused here */
  zeros = _mm_setzero_si128();
  const_ftzero = _mm_set1_epi16(ftzero);
  const_ftzero_x48 = _mm_set1_epi16(ftzero*48);
  const_ftzero_x2 = _mm_set1_epi16(ftzero*2);

  for (j=0; j<yim; j++, im+=xim, ftim+=xim)
    {
      accp = accbuf;
      intp = intbuf+8;		/* leave 8 zero lanes before the row prefix */
      acc = _mm_setzero_si128();
      imp = im;			/* newest row */
      impp = im - PYKERN*xim;	/* row leaving the vertical window */

      /* first PYKERN rows: accumulate only, no subtraction, no output */
      if (j<PYKERN)
	{
	  for (i=0; i<xim; i+=16, imp+=16, intp+=16, accp+=16)
	    {
	      pxs = _mm_load_si128((__m128i *)imp);
	      accs = _mm_unpacklo_epi8(pxs,zeros);	/* low 8 pixels -> 16-bit */
	      acc = _mm_srli_si128(acc, 14);		/* carry in last prefix value */
	      accs = _mm_add_epi16(accs, acc);

	      /* in-register inclusive prefix sum of 8 lanes */
	      acct = _mm_slli_si128(accs, 2);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 4);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 8);
	      acc1 = _mm_add_epi16(accs, acct);
	      _mm_store_si128((__m128i *)intp,acc1);

	      accs = _mm_unpackhi_epi8(pxs,zeros);	/* high 8 pixels */
	      acc = _mm_srli_si128(acc1, 14);
	      accs = _mm_add_epi16(accs, acc);

	      acct = _mm_slli_si128(accs, 2);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 4);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 8);
	      acc = _mm_add_epi16(accs, acct);
	      _mm_store_si128((__m128i *)(intp+8),acc);

	      /* 7-wide horizontal sum = prefix[i] - prefix[i-7]; add into accbuf */
	      acct = _mm_loadu_si128((__m128i *)(intp-7));
	      acc1 = _mm_sub_epi16(acc1,acct);
	      accs = _mm_load_si128((__m128i *)accp);
	      acc1 = _mm_add_epi16(acc1,accs);
	      _mm_store_si128((__m128i *)accp,acc1);

	      acct = _mm_loadu_si128((__m128i *)(intp-7+8));
	      acc1 = _mm_sub_epi16(acc,acct);
	      accs = _mm_load_si128((__m128i *)(accp+8));
	      acc1 = _mm_add_epi16(acc1,accs);
	      _mm_store_si128((__m128i *)(accp+8),acc1);

	    }
	}

      /* steady state: add newest row, subtract row PYKERN back, and emit */
      else
	{
	  for (i=0; i<xim; i+=16, imp+=16, impp+=16, intp+=16, accp+=16)
	    {
	      pxs = _mm_load_si128((__m128i *)imp);
	      opxs = _mm_load_si128((__m128i *)impp);	/* old row pixels */
	      accs = _mm_unpacklo_epi8(pxs,zeros);
	      acc = _mm_srli_si128(acc, 14);
	      accs = _mm_add_epi16(accs, acc);
	      acct = _mm_unpacklo_epi8(opxs,zeros);
	      accs = _mm_sub_epi16(accs, acct);		/* new minus old */

	      /* prefix sum of the (new-old) differences, low half */
	      acct = _mm_slli_si128(accs, 2);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 4);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 8);
	      acc1 = _mm_add_epi16(accs, acct);
	      _mm_store_si128((__m128i *)intp,acc1);

	      /* high half */
	      accs = _mm_unpackhi_epi8(pxs,zeros);
	      acc = _mm_srli_si128(acc1, 14);
	      accs = _mm_add_epi16(accs, acc);
	      acct = _mm_unpackhi_epi8(opxs,zeros);
	      accs = _mm_sub_epi16(accs, acct);

	      acct = _mm_slli_si128(accs, 2);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 4);
	      accs = _mm_add_epi16(accs, acct);
	      acct = _mm_slli_si128(accs, 8);
	      acc = _mm_add_epi16(accs, acct);
	      _mm_store_si128((__m128i *)(intp+8),acc);

	      /* update box sums: accbuf += 7-wide sum of (new-old) */
	      acct = _mm_loadu_si128((__m128i *)(intp-7));
	      acc1 = _mm_sub_epi16(acc1,acct);
	      accs = _mm_load_si128((__m128i *)accp);
	      acc1 = _mm_add_epi16(acc1,accs);
	      _mm_store_si128((__m128i *)accp,acc1);

	      acct = _mm_loadu_si128((__m128i *)(intp-7+8));
	      acc1 = _mm_sub_epi16(acc,acct);
	      accs = _mm_load_si128((__m128i *)(accp+8));
	      acc1 = _mm_add_epi16(acc1,accs);
	      _mm_store_si128((__m128i *)(accp+8),acc1);
	    }

	  /* output pass: center-weighted value minus box sum, per 16 pixels */
	  accp = accbuf+6;	/* align box sums with their center pixels */
	  imp = im - (PYKERN/2)*xim + PXKERN/2;
	  ftimp = ftim - (PYKERN/2)*xim + PXKERN/2;
	  for (i=0; i<xim-8; i+=16, imp+=16, accp+=16, ftimp+=16)
	    {
	      /* gather center (x4) and the four direct neighbors */
	      pxs = _mm_loadu_si128((__m128i *)imp);
	      accn1 = _mm_unpacklo_epi8(pxs,zeros);
	      accn2 = _mm_unpackhi_epi8(pxs,zeros);
	      accs = _mm_slli_epi16(accn1,2);		/* 4*center, low */
	      acc = _mm_slli_epi16(accn2,2);		/* 4*center, high */
	      pxs = _mm_loadu_si128((__m128i *)(imp+1));
	      acct = _mm_unpacklo_epi8(pxs,zeros);
	      acc1 = _mm_unpackhi_epi8(pxs,zeros);
	      accs = _mm_add_epi16(acct,accs);
	      acc = _mm_add_epi16(acc1,acc);
	      pxs = _mm_loadu_si128((__m128i *)(imp-1));
	      acct = _mm_unpacklo_epi8(pxs,zeros);
	      acc1 = _mm_unpackhi_epi8(pxs,zeros);
	      accs = _mm_add_epi16(acct,accs);
	      acc = _mm_add_epi16(acc1,acc);
	      pxs = _mm_loadu_si128((__m128i *)(imp+xim));
	      acct = _mm_unpacklo_epi8(pxs,zeros);
	      acc1 = _mm_unpackhi_epi8(pxs,zeros);
	      accs = _mm_add_epi16(acct,accs);
	      acc = _mm_add_epi16(acc1,acc);
	      pxs = _mm_loadu_si128((__m128i *)(imp-xim));
	      acct = _mm_unpacklo_epi8(pxs,zeros);
	      acc1 = _mm_unpackhi_epi8(pxs,zeros);
	      accs = _mm_add_epi16(acct,accs);
	      acc = _mm_add_epi16(acc1,acc);

	      /* qval = 6*(4c + neighbors) + c  (2x + 4x = 6x), low half */
	      acct = _mm_slli_epi16(accs,2);
	      accs = _mm_add_epi16(accs,accs);
	      accs = _mm_add_epi16(accs,acct);
	      accs = _mm_add_epi16(accs,accn1);

	      acct = _mm_loadu_si128((__m128i *)accp);
	      accs = _mm_sub_epi16(accs,acct);		/* minus box sum */

	      accs = _mm_srai_epi16(accs,3);		/* /8, as scalar XKERN==7 */
	      accs = _mm_adds_epi16(accs,const_ftzero);	/* bias */

	      accs = _mm_max_epi16(accs,zeros);		/* clamp [0, 2*ftzero] */
	      accs = _mm_min_epi16(accs,const_ftzero_x2);

	      /* same for the high 8 pixels */
	      acct = _mm_slli_epi16(acc,2);
	      acc = _mm_add_epi16(acc,acc);
	      acc = _mm_add_epi16(acc,acct);
	      acc = _mm_add_epi16(acc,accn2);

	      acct = _mm_loadu_si128((__m128i *)(accp+8));
	      acc1 = _mm_sub_epi16(acc,acct);

	      acc1 = _mm_srai_epi16(acc1,3);
	      acc1 = _mm_adds_epi16(acc1,const_ftzero);

	      acc1 = _mm_max_epi16(acc1,zeros);
	      acc1 = _mm_min_epi16(acc1,const_ftzero_x2);

	      /* pack 16 results back to bytes and store */
	      accs = _mm_packus_epi16(accs,acc1);
	      _mm_storeu_si128((__m128i *)ftimp,accs);

	    }

	}

    }

}
00912
00913
00914
00915
00916
00917
#define EXACTUNIQ // set this to do exact uniqueness values

/*
 * do_stereo_d_fast - SSE2 disparity-major stereo correlation.
 *
 * Vectorized counterpart of do_stereo_d: per image column it updates SAD
 * window sums for 16 disparities at a time (incremental box filtering
 * with intbuf / corrbuf ring / accbuf as in the scalar version), then for
 * each output row finds the minimum SAD, its disparity index, and a
 * uniqueness count of near-minimum disparities -- all with horizontal
 * min/max shuffle reductions.  Results for 8 consecutive rows are staged
 * in xmm registers (cval/pval/nval/ival/uval, one 16-bit lane per row)
 * and a division-free shift-and-subtract loop computes 4 bits of
 * parabolic subpixel fraction for all 8 rows at once.  Output disparity
 * is 16*d - fraction, or 0 where the texture/uniqueness filters reject
 * (rejected lanes are masked to zero rather than FILTEREDVAL -- note the
 * difference from the scalar versions).
 *
 * dlen must be a multiple of 16 and ufilter_thresh > 0 is assumed by the
 * uniqueness math -- TODO confirm against callers.
 * XKERN/YKERN come from frame_common/stereolib.h (not visible here).
 */
void
do_stereo_d_fast(uint8_t *lim, uint8_t *rim,
		 int16_t *disp,
		 int16_t *text,
		 int xim, int yim,
		 uint8_t ftzero,
		 int xwin, int ywin,
		 int dlen,
		 int tfilter_thresh,
		 int ufilter_thresh,
		 uint8_t *buf
		 )
{
  int i,j,k,d;
  int16_t *accp, *accpp;
  int8_t *limp, *rimp, *limpp, *rimpp, *limp2, *limpp2;
  int8_t *corrend, *corrp, *corrpp;
  int16_t *intp, *intpp;
  int16_t *textpp;
  int16_t *dispp, *disppp;
  int16_t *textp;
  int16_t acc;
  int dval;
  int uniqthresh;

  __m128i limpix, rimpix, newpix, tempix, oldpix;
  __m128i newval, temval;
  __m128i minv, indv, indtop, indd, indm, temv, nexv;
  __m128i cval, pval, nval, ival;	/* per-row center/prev/next SAD and index lanes */
  __m128i denv, numv, intv, sgnv;
  __m128i uniqcnt, uniqth, uniqinit, unim, uval;

  /* lane masks and small constants; initializer layout differs between
     MSVC (bytes) and GCC-style (two 64-bit halves) */
#ifdef WIN32
  const __m128i p0xfffffff0 = {0x0, 0x0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
			       0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
  const __m128i p0x0000000f = {0xff, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
			       0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
  const __m128i zeros = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
			 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
  const __m128i val_epi16_7fff = {0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f,
				  0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f, 0xff, 0x7f};
  const __m128i val_epi16_8 = {0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0,
			       0x08, 0x0, 0x08, 0x0, 0x08, 0x0, 0x08, 0x0};
  const __m128i val_epi16_1 = {0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
			       0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00};
  const __m128i val_epi16_2 = {0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00,
			       0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00};
  const __m128i val_epi16_3 = {0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00,
			       0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00};
#else
  const __m128i p0xfffffff0 = {0xffffffffffff0000LL, 0xffffffffffffffffLL};	/* all lanes but lane 0 */
  const __m128i p0x0000000f = {0xffffLL, 0x0LL};				/* lane 0 only */
  const __m128i zeros = {0x0LL, 0x0LL};
  const __m128i val_epi16_7fff = {0x7fff7fff7fff7fffLL, 0x7fff7fff7fff7fffLL};
  const __m128i val_epi16_8 = {0x0008000800080008LL, 0x0008000800080008LL};
  const __m128i val_epi16_1 = {0x0001000100010001LL, 0x0001000100010001LL};
  const __m128i val_epi16_2 = {0x0002000200020002LL, 0x0002000200020002LL};
  const __m128i val_epi16_3 = {0x0003000300030003LL, 0x0003000300030003LL};
#endif

  /* carve the scratch area into 16-byte-aligned buffers */
  uintptr_t bufp = (uintptr_t)buf;
  int16_t *intbuf, *textbuf, *accbuf, temp;
  int8_t *corrbuf;

  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  buf = (uint8_t *)bufp;

#define INTEBUFSIZE ((yim+16+ywin)*dlen)
  intbuf = (int16_t *)buf;		/* vertical sliding sums */
  bufp = (uintptr_t)intbuf;
  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  intbuf = (int16_t *)bufp;

#define TXTBUFSIZE (yim + 64)
  textbuf = (int16_t *)&buf[INTEBUFSIZE*sizeof(int16_t)];	/* texture sums */
  bufp = (uintptr_t)textbuf;
  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  textbuf = (int16_t *)bufp;

#define ACCBUFSIZE (yim*dlen + 64)
  accbuf = (int16_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE)*sizeof(int16_t)];	/* window SADs */
  bufp = (uintptr_t)accbuf;
  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  accbuf = (int16_t *)bufp;

  corrbuf = (int8_t *)&buf[(INTEBUFSIZE+TXTBUFSIZE+ACCBUFSIZE)*sizeof(int16_t)];	/* xwin-column ring */
  bufp = (uintptr_t)corrbuf;
  if (bufp & 0xF)
    bufp = (bufp+15) & ~(uintptr_t)0xF;
  corrbuf = (int8_t *)bufp;

  memclr_si128((__m128i *)intbuf, dlen*yim*sizeof(int16_t));
  memclr_si128((__m128i *)corrbuf, dlen*yim*xwin*sizeof(int8_t));
  memclr_si128((__m128i *)accbuf, dlen*yim*sizeof(int16_t));
  memclr_si128((__m128i *)textbuf, yim*sizeof(int16_t));

  corrend = corrbuf + dlen*yim*xwin;
  corrp = corrbuf;

  /* same setup as the scalar do_stereo_d */
  limp = (int8_t *)lim + dlen - 1;
  limp2 = limp;
  rimp = (int8_t *)rim;
  dispp = disp + xim*(ywin+YKERN-2)/2 + dlen + (xwin+XKERN-2)/2;
  textp = NULL;
  if (text != NULL)
    textp = text + dlen;

  /* convert percent texture threshold to window-sum scale */
  tfilter_thresh = tfilter_thresh * xwin * ywin * ftzero;
  tfilter_thresh = tfilter_thresh / 100;

  /* descending lane indices for the vectorized argmin; uniqueness
     threshold as a 0.16 fixed-point fraction for _mm_mulhi_epu16 */
  indtop = _mm_set_epi16(dlen+0, dlen+1, dlen+2, dlen+3, dlen+4, dlen+5, dlen+6, dlen+7);
  uniqthresh = (0x8000 * ufilter_thresh)/100;

  uniqinit = _mm_set1_epi16(uniqthresh);

  intv = zeros;
  ival = zeros;
  nval = zeros;
  pval = zeros;
  cval = zeros;
  uval = zeros;

  for (i=0; i<xim-XKERN-dlen+2; i++, limp++, rimp++, corrp+=yim*dlen)
    {
      accp = accbuf;
      if (corrp >= corrend) corrp = corrbuf;	/* wrap the column ring */
      limpp = limp;
      rimpp = rimp;
      corrpp = corrp;
      intp = intbuf+(ywin-1)*dlen;
      intpp = intbuf;

      /* update window sums, 16 disparities per vector op */
      for (j=0; j<yim-YKERN+1; j++, limpp+=xim, rimpp+=xim-dlen)
	{
	  /* broadcast the left pixel to all 16 byte lanes */
	  limpix = _mm_set1_epi8(*limpp);

	  for (d=0; d<dlen; d+=16, intp+=16, intpp+=16, accp+=16, corrpp+=16, rimpp+=16)
	    {
	      /* |l - r| via saturating subtract both ways, then add */
	      rimpix = _mm_loadu_si128((__m128i *)rimpp);
	      newpix = _mm_subs_epu8(limpix,rimpix);
	      tempix = _mm_subs_epu8(rimpix,limpix);
	      newpix = _mm_add_epi8(newpix,tempix);

	      /* low 8 disparities: new vertical sum = old + new - ring cost */
	      oldpix = _mm_load_si128((__m128i *)corrpp);
	      newval = _mm_unpacklo_epi8(newpix,zeros);
	      temval = _mm_load_si128((__m128i *)intp);
	      newval = _mm_add_epi16(newval,temval);
	      temval = _mm_unpacklo_epi8(oldpix,zeros);
	      newval = _mm_sub_epi16(newval,temval);

	      _mm_store_si128((__m128i *)corrpp,newpix);	/* refill ring slot */

	      _mm_store_si128((__m128i *)(intp+dlen),newval);	/* next row's prev sum */

	      /* slide the ywin vertical window into accbuf */
	      temval = _mm_load_si128((__m128i *)intpp);
	      newval = _mm_sub_epi16(newval,temval);
	      temval = _mm_load_si128((__m128i *)accp);
	      newval = _mm_add_epi16(newval,temval);
	      _mm_store_si128((__m128i *)accp,newval);

	      /* high 8 disparities, same sequence */
	      newval = _mm_unpackhi_epi8(newpix,zeros);
	      temval = _mm_load_si128((__m128i *)(intp+8));
	      newval = _mm_add_epi16(newval,temval);
	      temval = _mm_unpackhi_epi8(oldpix,zeros);
	      newval = _mm_sub_epi16(newval,temval);

	      _mm_store_si128((__m128i *)(intp+dlen+8),newval);

	      temval = _mm_load_si128((__m128i *)(intpp+8));
	      newval = _mm_sub_epi16(newval,temval);
	      temval = _mm_load_si128((__m128i *)(accp+8));
	      newval = _mm_add_epi16(newval,temval);
	      _mm_store_si128((__m128i *)(accp+8),newval);
	    }
	}

      /* texture measure, scalar (head of intbuf reused as in do_stereo_d) */
      memclr_si128((__m128i *)intbuf, yim*sizeof(int16_t));
      limpp = limp;
      limpp2 = limp2;
      intp = intbuf+ywin-1;
      intpp = intbuf;
      accpp = textbuf;
      acc = 0;

#if 1 // this should be optimized
      if (i < xwin)			/* window filling: add only */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim)
	    {
	      temp = abs(*limpp- ftzero);
	      *intp = temp;
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	}
      else				/* add new column, drop old */
	{
	  for (j=0; j<yim-YKERN+1; j++, limpp+=xim, limpp2+=xim)
	    {
	      temp = abs(*limpp-ftzero);
	      *intp = temp - abs(*limpp2-ftzero);
	      acc += *intp++ - *intpp++;
	      *accpp++ += acc;
	    }
	  limp2++;
	}
#endif

      /* disparity search once the window spans xwin columns */
      if (i >= xwin)
	{
	  disppp = dispp;
	  accp = accbuf + (ywin-1)*dlen;
	  textpp = textbuf + (ywin-1);

	  /* prime nexv with the lane-wise minima of the first row's SADs */
	  accpp = accp;
	  nexv = val_epi16_7fff;
	  for (d=0; d<dlen; d+=8, accpp+=8)
	    nexv = _mm_min_epi16(nexv,*(__m128i *)accpp);

	  /* process 8 output rows per iteration */
	  for (j=0; j<yim-ywin-YKERN+2; j+=8)
	    {
	      for (k=0; k<8; k++, textpp++)
		{
		  /* shift the staging registers; this row's result lands in lane 0 */
		  cval = _mm_slli_si128(cval,2);
		  pval = _mm_slli_si128(pval,2);
		  nval = _mm_slli_si128(nval,2);
		  uval = _mm_slli_si128(uval,2);
		  ival = _mm_slli_si128(ival,2);

		  /* horizontal min reduction of nexv -> scalar min in every lane */
		  temv = _mm_shufflelo_epi16(nexv,0xb1);
		  temv = _mm_shufflehi_epi16(temv,0xb1);
		  minv = _mm_min_epi16(nexv,temv);
		  minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0xb1));
		  minv = _mm_min_epi16(minv,_mm_shuffle_epi32(minv,0x4e));

		  cval = _mm_or_si128(_mm_and_si128(cval,p0xfffffff0),_mm_and_si128(minv,p0x0000000f));

		  /* uniqueness threshold = min * (1 + ufilter_thresh/100)
		     via 0.16 fixed-point mulhi */
		  uniqth = _mm_mulhi_epu16(uniqinit,minv);
		  uniqth = _mm_add_epi16(uniqth,minv);

		  /* scan this row's SADs: find argmin index (descending lane
		     indices, saturating subtract keeps the largest surviving
		     index = smallest disparity offset), count near-minimum
		     entries, and pre-compute next row's lane minima */
		  indv = indtop;
		  indd = val_epi16_8;
		  uniqcnt = zeros;
		  nexv = val_epi16_7fff;
		  for (d=0; d<dlen; d+=8, accp+=8)
		    {
		      indm = _mm_cmpgt_epi16(*(__m128i *)accp,minv);
		      indv = _mm_subs_epu16(indv,indd);
		      indd = _mm_and_si128(indd,indm);
		      unim = _mm_cmpgt_epi16(uniqth,*(__m128i *)accp);
		      nexv = _mm_min_epi16(nexv,*(__m128i *)(accp+dlen));
		      uniqcnt = _mm_sub_epi16(uniqcnt,unim);
		    }
		  indv = _mm_subs_epu16(indv,indd);

		  /* horizontal max reduction -> winning index in every lane */
		  temv = _mm_shufflelo_epi16(indv,0xb1);
		  temv = _mm_shufflehi_epi16(temv,0xb1);
		  indv = _mm_max_epi16(indv,temv);
		  indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0xb1));
		  indv = _mm_max_epi16(indv,_mm_shuffle_epi32(indv,0x4e));

		  /* fetch the SAD neighbors of the minimum for interpolation;
		     accp now points one row past, hence the backward offsets */
		  dval = _mm_extract_epi16(indv,0);
		  nval = _mm_insert_epi16(nval,*(accp-dval),0);
		  pval = _mm_insert_epi16(pval,*(accp-dval-2),0);

		  ival = _mm_or_si128(_mm_and_si128(ival,p0xfffffff0),_mm_and_si128(indv,p0x0000000f));

		  /* horizontally sum the per-lane near-min counts (counts are
		     small, so byte-wise SAD against zero adds them exactly) */
		  uniqcnt = _mm_sad_epu8(uniqcnt,zeros);
		  uniqcnt = _mm_add_epi32(uniqcnt, _mm_srli_si128(uniqcnt,8));
#ifdef EXACTUNIQ
		  /* discount the two neighbors of the minimum, which are
		     legitimately below the threshold */
		  unim = _mm_cmpgt_epi16(uniqth,nval);
		  uniqcnt = _mm_add_epi16(uniqcnt,unim);
		  unim = _mm_cmpgt_epi16(uniqth,pval);
		  uniqcnt = _mm_add_epi16(uniqcnt,unim);
#endif
		  /* texture filter: force the rejection count high */
		  if (*textpp < tfilter_thresh)
		    uniqcnt = val_epi16_8;
		  uval = _mm_or_si128(_mm_and_si128(uval,p0xfffffff0),
				      _mm_and_si128(uniqcnt,p0x0000000f));
		}

	      /* --- subpixel interpolation for 8 rows at once ---
		 numerator |p-n| (sign saved in sgnv), denominator
		 p+n-2c (+|p-n| folds the 1/2), then a 4-step
		 compare/subtract/shift long division produces 4 result
		 bits in intv */
	      numv = _mm_sub_epi16(pval,nval);
	      sgnv = _mm_cmpgt_epi16(nval,pval);
	      numv = _mm_xor_si128(numv,sgnv);	/* absolute value via xor/sub */
	      numv = _mm_sub_epi16(numv,sgnv);

	      denv = _mm_add_epi16(pval,nval);
	      denv = _mm_sub_epi16(denv,cval);
	      denv = _mm_sub_epi16(denv,cval);

	      denv = _mm_add_epi16(denv,numv);
	      numv = _mm_add_epi16(numv,numv);

	      numv = _mm_add_epi16(numv,numv);

	      /* division bit 3 */
	      temv = _mm_cmpgt_epi16(numv,denv);
	      intv = _mm_srli_epi16(temv,15);
	      intv = _mm_slli_epi16(intv,1);
	      numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv));
	      numv = _mm_slli_epi16(numv,1);

	      /* bit 2 */
	      temv = _mm_cmpgt_epi16(numv,denv);
	      intv = _mm_sub_epi16(intv,temv);
	      intv = _mm_slli_epi16(intv,1);
	      numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv));
	      numv = _mm_slli_epi16(numv,1);

	      /* bit 1 */
	      temv = _mm_cmpgt_epi16(numv,denv);
	      intv = _mm_sub_epi16(intv,temv);
	      intv = _mm_slli_epi16(intv,1);
	      numv = _mm_subs_epu16(numv,_mm_and_si128(temv,denv));
	      numv = _mm_slli_epi16(numv,1);

	      /* bit 0 */
	      temv = _mm_cmpgt_epi16(numv,denv);
	      intv = _mm_sub_epi16(intv,temv);

	      /* round, then restore the sign saved in sgnv */
	      intv = _mm_add_epi16(intv,val_epi16_1);
	      intv = _mm_srli_epi16(intv,1);
	      intv = _mm_xor_si128(intv,sgnv);
	      intv = _mm_sub_epi16(intv,sgnv);

	      /* disparity*16 minus subpixel correction */
	      ival = _mm_slli_epi16(ival,4);
	      ival = _mm_sub_epi16(ival,intv);

	      /* uniqueness accept mask: small near-min count passes
		 (threshold 2 when the two neighbors were discounted) */
#ifdef EXACTUNIQ
	      uval = _mm_cmplt_epi16(uval,val_epi16_2);
#else
	      uval = _mm_cmplt_epi16(uval,val_epi16_3);
#endif
	      ival = _mm_and_si128(uval,ival);	/* rejected lanes -> 0 */

	      /* scatter the 8 staged rows down the disparity column */
	      *disppp = _mm_extract_epi16(ival,7);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,6);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,5);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,4);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,3);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,2);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,1);
	      disppp += xim;
	      *disppp = _mm_extract_epi16(ival,0);
	      disppp += xim;

	    }

	  dispp++;
	}

    }

  /* the 8-row blocking can overrun the valid region at the bottom;
     clear the last YKERN rows of the disparity image */
  dispp = disp + (yim - YKERN/2 - ywin/2)*xim;
  for (i=0; i<YKERN; i++, dispp+=xim)
    memclr_si128((__m128i *)dispp, xim*sizeof(int16_t));

}
01375
01376
01377
01378
01379
01380
01381
01382
01383
01384
01385
01386
/*
 * do_stereo_sparse
 *
 * Brute-force disparity search for a single 15x15 reference patch.
 * The patch <refpat> (row stride 16, 15x15 pixels used) is compared by
 * SAD against every candidate window along the epipolar line of (x,y)
 * in the right image <rim>, over <dlen> disparities, then the best
 * match is refined to subpixel precision by parabolic interpolation.
 *
 * refpat          reference patch, 16-byte row stride, 15 rows used
 * rim             right (search) image, xim x yim
 * x, y            feature position in the left image
 * xim, yim        image dimensions
 * ftzero          feature offset from zero (unused here)
 * dlen            number of disparities to search (at most 1024)
 * tfilter_thresh  texture filter threshold (unused here)
 * ufilter_thresh  uniqueness filter threshold (unused here)
 *
 * Returns the disparity in 1/16 pixel units, or -1 if the search
 * window would fall outside the image or dlen is out of range.
 */
int
do_stereo_sparse(uint8_t *refpat, uint8_t *rim,
		 int x, int y,
		 int xim, int yim,
		 uint8_t ftzero,
		 int dlen,
		 int tfilter_thresh,
		 int ufilter_thresh
		 )
{
  int i,j,k,sum,min,ind;
  uint8_t *rp, *rpp, *rppp, *pp, *ppp;
  int cbuf[1024];
  int *cp;
  float c, p, n, v;

  /* The 15x15 window spans columns x-7..x+7 and is shifted left by up
     to dlen-1 pixels, so we need x >= dlen+6 and x <= xim-8; rows span
     y-7..y+7.  dlen must also fit in cbuf (the old guard `x < dlen`
     allowed reads before the start of the row, and an oversized dlen
     overflowed the stack buffer). */
  if (x < dlen+6 || x > xim-8 || y < 7 || y > yim-8 || dlen < 1 || dlen > 1024)
    return -1;

  min = (int)1e6;		/* larger than any possible 15x15 SAD */
  ind = 0;
  /* top-left corner of the candidate window at the largest disparity */
  rp = rim + (y-7)*xim + x - dlen + 1 - 7;
  cp = cbuf;
  for (i=0; i<dlen; i++, rp++, cp++)
    {
      rpp = rp;
      pp = refpat;
      sum = 0;
      for (j=0; j<15; j++, rpp+=xim, pp+=16)	/* refpat rows are 16 wide */
	{
	  rppp = rpp;
	  ppp = pp;
	  for (k=0; k<15; k++)
	    sum += abs(*ppp++ - *rppp++);
	}

      *cp = sum;
      if (sum < min)		/* strict: ind is the FIRST minimum */
	{
	  min = sum;
	  ind = i;
	}
    }

  /* no neighbors to interpolate with at the ends of the search range */
  if (ind==0 || ind==(dlen-1))
    return (dlen-ind-1)*16;

  /* parabolic subpixel interpolation around the minimum; since ind is
     the first strict minimum, n > c, so the denominator is positive */
  c = (float)min;
  p = (float)cbuf[ind+1];
  n = (float)cbuf[ind-1];
  v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
  return (int)(0.5 + 16*v);
}
01439
01440
01441
01442
01443
01444
01445
01446
01447
01448
01449
01450
01451
/*
 * do_stereo_sparse_fast
 *
 * SSE2-accelerated version of do_stereo_sparse.  NOTE: the SSE2 path
 * computes a 16x16 SAD (using all 16 bytes per row of refpat and 16
 * rows), while the scalar fallback computes 15x15 like
 * do_stereo_sparse, so results can differ slightly between builds.
 *
 * Parameters and return value as in do_stereo_sparse.
 */
int __attribute__ ((force_align_arg_pointer))
do_stereo_sparse_fast(uint8_t *refpat, uint8_t *rim,
		      int x, int y,
		      int xim, int yim,
		      uint8_t ftzero,
		      int dlen,
		      int tfilter_thresh,
		      int ufilter_thresh
		      )
{
  int i,j,k,sum,min,ind;
  uint8_t *rp, *rpp, *rppp, *pp, *ppp;
  int cbuf[1024];
  int *cp;
  float c, p, n, v;

  /* The SSE2 path reads a 16x16 window (rows y-7..y+8, 16 columns),
     one row and one column more than the scalar 15x15 path, so the
     bounds are one tighter on the right and bottom.  The old guard
     (`x < dlen`, `y > yim-8`) allowed out-of-bounds reads. */
  if (x < dlen+6 || x > xim-9 || y < 7 || y > yim-9 || dlen < 1 || dlen > 1024)
    return -1;

  min = (int)1e6;
  ind = 0;
  /* top-left corner of the candidate window at the largest disparity */
  rp = rim + (y-7)*xim + x - dlen + 1 - 7;
  cp = cbuf;
  for (i=0; i<dlen; i++, rp++, cp++) {
    rpp = rp;
    pp = refpat;
#ifdef __SSE2__
    /* 16x16 SAD via PSADBW.  Integer loads (_mm_loadu_si128) replace
       the previous float-typed _mm_loadu_ps casts, which type-punned
       integer data through the float domain. */
    __m128i total = _mm_setzero_si128();
    for (j=0; j<16; j++, pp+=16, rpp+=xim) {
      __m128i a = _mm_loadu_si128((__m128i *)pp);
      __m128i b = _mm_loadu_si128((__m128i *)rpp);
      total = _mm_add_epi32(_mm_sad_epu8(a, b), total);
    }
    /* PSADBW leaves one partial sum in each 64-bit half; fold them */
    total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
    sum = _mm_cvtsi128_si32(total);
#else
    /* scalar fallback: 15x15 window, as in do_stereo_sparse */
    sum = 0;
    for (j=0; j<15; j++, rpp+=xim, pp+=16) {
      rppp = rpp;
      ppp = pp;
      for (k=0; k<15; k++)
	sum += abs(*ppp++ - *rppp++);
    }
#endif

    *cp = sum;
    if (sum < min) {		/* strict: ind is the FIRST minimum */
      min = sum;
      ind = i;
    }
  }

  /* no neighbors to interpolate with at the ends of the search range */
  if (ind==0 || ind==(dlen-1))
    return (dlen-ind-1)*16;

  /* parabolic subpixel interpolation; denominator > 0 since ind is the
     first strict minimum (cbuf[ind-1] > min) */
  c = (float)min;
  p = (float)cbuf[ind+1];
  n = (float)cbuf[ind-1];
  v = (float)(dlen-ind-1) + (p-n)/(2*(p+n-2*c));
  return (int)(0.5 + 16*v);
}
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574
/* Push pixel <x> onto the region-growing stack if it holds a valid
   disparity, is not yet labeled, and is within rdiff of disparity <y>.
   Uses ws, cur, disp, badval, rdiff, labels from the enclosing scope. */
#define push(x,y) do { if ((disp[x] != badval) && (!labels[x]) \
	  && (disp[x]-y < rdiff) && (disp[x]-y > -rdiff)) \
	  { labels[x] = cur; *ws++ = x; }} while (0)

/*
 * do_speckle
 *
 * Speckle filter: flood-fills connected regions of similar disparity
 * and invalidates (sets to badval) every region smaller than rcount
 * pixels.  Only the interior (4-pixel border excluded) is scanned.
 *
 * disp    disparity image, width x height int16, modified in place
 * badval  value marking an invalid disparity
 * rdiff   max disparity difference between 4-connected region pixels
 * rcount  minimum region size (in pixels) to survive
 * labels  scratch buffer, width*height uint32 entries
 * wbuf    scratch stack, width*height uint32 entries
 * rtype   per-label flag written here: 1 = region was erased
 */
void
do_speckle(int16_t *disp, int16_t badval, int width, int height,
	   int rdiff, int rcount,
	   uint32_t *labels, uint32_t *wbuf, uint8_t *rtype)
{
  int i,j,k,cnt;
  uint32_t *ws, *ls;
  uint32_t p, cur;
  int16_t dp, *ds;

  memset(labels,0x0,width*height*sizeof(uint32_t));

  /* flood fill never leaves this index range (4-row top/bottom margin) */
  const uint32_t MIN_P = 4*width+4;
  const uint32_t MAX_P = (height-4)*width-4;
  cur = 0;
  for (i=4; i<height-4; i++)
    {
      k = i*width+4;
      ds = disp+k;
      ls = labels+k;
      for (j=4; j<width-4; j++, k++, ls++, ds++)
	{
	  if (*ds != badval)
	    {
	      if (*ls)		/* already labeled by an earlier fill */
		{
		  if (rtype[*ls]) /* its region was too small: erase */
		    *ds = badval;
		}
	      else		/* start a new region */
		{
		  ws = wbuf;
		  cur++;
		  cnt = 0;
		  labels[k] = cur;
		  *ws++ = k;	/* seed the stack; popping before
				   processing below means the loop stops
				   exactly when the stack empties.  (The
				   original post-processing pop executed
				   one final `*--ws` with ws == wbuf,
				   reading wbuf[-1] out of bounds.) */

		  while (ws > wbuf)
		    {
		      p = *--ws;
		      cnt++;
		      dp = disp[p];
		      if (p+1 < MAX_P) push(p+1,dp);
		      if (p-1 >= MIN_P) push(p-1,dp);
		      if (p-width >= MIN_P) push(p-width,dp);
		      if (p+width < MAX_P) push(p+width,dp);
		    }

		  /* record whether this region survives */
		  if (cnt < rcount)
		    {
		      rtype[*ls] = 1;	/* too small: erase seed too */
		      *ds = badval;
		    }
		  else
		    rtype[*ls] = 0;	/* big enough: keep */
		}
	    }
	}
    }
}
01658
01659
01660
01661
01662
01663
01664
01665
01666 void
01667 do_rectify_mono(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01668 {
01669 int i,j;
01670 uint8_t *p;
01671 int val;
01672
01673 for (i=0; i<h; i++)
01674 {
01675
01676 for (j=0; j<w; j++, dest++, rtab++)
01677 {
01678 p = src + rtab->addr;
01679 val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01680 *dest = val >> INTOFFSET;
01681 }
01682 }
01683 }
01684
01685
01686 void
01687 do_rectify_mono_fast(uint8_t *dest, uint8_t *src, int w, int h, inttab_t *rtab)
01688 {
01689 int i,j;
01690 uint8_t *p;
01691 int val;
01692
01693 for (i=0; i<h; i++)
01694 {
01695
01696 for (j=0; j<w; j++, dest++, rtab++)
01697 {
01698 p = src + rtab->addr;
01699 val = (*p)*rtab->a1 + (*(p+1))*rtab->a2 + (*(p+w))*rtab->b1 + (*(p+w+1))*rtab->b2;
01700 *dest = val >> INTOFFSET;
01701 }
01702 }
01703 }