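/* This file compiles itself twice: on the first pass (no
   VL_MATHOP_SSE2_INSTANTIATING) it re-includes itself once with
   FLT = VL_TYPE_DOUBLE and once with FLT = VL_TYPE_FLOAT. On each
   instantiating pass, float.th is expected to expand T, SFX, VSIZE,
   VTYPE and the V* intrinsic wrappers for the selected precision. */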
#ifndef VL_MATHOP_SSE2_INSTANTIATING

#include "mathop_sse2.h"

#undef FLT
#define FLT VL_TYPE_DOUBLE
#define VL_MATHOP_SSE2_INSTANTIATING
#include "mathop_sse2.c"

#undef FLT
#define FLT VL_TYPE_FLOAT
#define VL_MATHOP_SSE2_INSTANTIATING
#include "mathop_sse2.c"

#else
#ifndef VL_DISABLE_SSE2

#ifndef __SSE2__
#error Compiling SSE2 functions, but SSE2 does not seem to be supported by the compiler.
#endif

#include <emmintrin.h>
#include "mathop.h"
#include "generic.h"
#include "float.th"

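/* Horizontally sum the VSIZE lanes of a SIMD register into a scalar.
   The reduced value ends up in every lane; VST1 reads it back from
   the lowest one. */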
VL_INLINE T
VL_XCAT(_vl_vhsum_sse2_, SFX)(VTYPE x)
{
  T acc ;
#if (VSIZE == 4)
  {
    VTYPE sum ;
    VTYPE shuffle ;
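    /* Two shuffle/add steps reduce x = [a b c d]:
       shuffle = [c d a b], sum = [a+c b+d c+a d+b],
       shuffle = [b+d a+c d+b c+a], x = [a+b+c+d ...]. */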
    shuffle = VSHU (x, x, _MM_SHUFFLE(1, 0, 3, 2)) ;
    sum = VADD (x, shuffle) ;
    shuffle = VSHU (sum, sum, _MM_SHUFFLE(2, 3, 0, 1)) ;
    x = VADD (sum, shuffle) ;
  }
#else
  {
    VTYPE shuffle ;
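    /* One swap/add step reduces x = [a b]: shuffle = [b a],
       x = [a+b a+b]. */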
    shuffle = VSHU (x, x, _MM_SHUFFLE2(0, 1)) ;
    x = VADD (x, shuffle) ;
  }
#endif
  VST1(&acc, x) ;
  return acc ;
}

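/* Dot product <X,Y>. The vector loop consumes VSIZE elements per
   iteration, using aligned loads when both pointers are suitably
   aligned; the scalar tail loop handles the remainder. */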
VL_EXPORT T
VL_XCAT(_vl_dot_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE d = VMUL(a, b) ;
      vacc = VADD(vacc, d) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE d = VMUL(a, b) ;
      vacc = VADD(vacc, d) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    acc += a * b ;
  }

  return acc ;
}

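/* Squared L2 distance: sum over i of (X[i] - Y[i])^2. */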
VL_EXPORT T
VL_XCAT(_vl_distance_l2_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      vacc = VADD(vacc, delta2) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      vacc = VADD(vacc, delta2) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    acc += delta * delta ;
  }

  return acc ;
}

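/* Squared Mahalanobis distance against a diagonal model: each squared
   difference (X[i] - MU[i])^2 is multiplied by S[i], so S is expected
   to hold the reciprocals of the diagonal covariance entries. */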
VL_EXPORT T
VL_XCAT(_vl_distance_mahalanobis_sq_sse2_, SFX)
(vl_size dimension, T const * X, T const * MU, T const * S)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(MU) & VALIGNED(S) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)MU ;
      VTYPE c = *(VTYPE*)S ;

      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      VTYPE delta2div = VMUL(delta2, c) ;

      vacc = VADD(vacc, delta2div) ;

      X += VSIZE ;
      MU += VSIZE ;
      S += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(MU) ;
      VTYPE c = VLDU(S) ;

      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      VTYPE delta2div = VMUL(delta2, c) ;

      vacc = VADD(vacc, delta2div) ;

      X += VSIZE ;
      MU += VSIZE ;
      S += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *MU++ ;
    T c = *S++ ;
    T delta = a - b ;
    acc += (delta * delta) * c ;
  }

  return acc ;
}

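/* L1 distance: sum over i of |X[i] - Y[i]|. vminus holds -0.0 (just
   the sign bit) in every lane, so VANDN(vminus, delta) clears the sign
   bit and yields |delta| without a branch. */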
VL_EXPORT T
VL_XCAT(_vl_distance_l1_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  VTYPE vminus = VL_XCAT(_mm_set1_p, VSFX) ((T) -0.0) ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE delta = VSUB(a, b) ;
      vacc = VADD(vacc, VANDN(vminus, delta)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE delta = VSUB(a, b) ;
      vacc = VADD(vacc, VANDN(vminus, delta)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    acc += VL_MAX(delta, - delta) ;
  }

  return acc ;
}

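/* Chi-squared distance: sum over i of (X[i] - Y[i])^2 / (X[i] + Y[i]).
   VNEQ(denom, VSTZ()) is an all-ones mask where the denominator is
   nonzero; ANDing the ratio with it zeroes the lanes where the
   division produced inf or NaN, so zero-denominator terms contribute
   nothing. */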
VL_EXPORT T
VL_XCAT(_vl_distance_chi2_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE delta = VSUB(a, b) ;
      VTYPE denom = VADD(a, b) ;
      VTYPE numer = VMUL(delta, delta) ;
      VTYPE ratio = VDIV(numer, denom) ;
      ratio = VAND(ratio, VNEQ(denom, VSTZ())) ;
      vacc = VADD(vacc, ratio) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE delta = VSUB(a, b) ;
      VTYPE denom = VADD(a, b) ;
      VTYPE numer = VMUL(delta, delta) ;
      VTYPE ratio = VDIV(numer, denom) ;
      ratio = VAND(ratio, VNEQ(denom, VSTZ())) ;
      vacc = VADD(vacc, ratio) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    T denom = a + b ;
    T numer = delta * delta ;
    if (denom) {
      T ratio = numer / denom ;
      acc += ratio ;
    }
  }
  return acc ;
}

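/* Linear (L2) kernel: the inner product <X,Y>. */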
VL_EXPORT T
VL_XCAT(_vl_kernel_l2_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      vacc = VADD(vacc, VMUL(a,b)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      vacc = VADD(vacc, VMUL(a,b)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    acc += a * b ;
  }
  return acc ;
}

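/* L1 (intersection) kernel: accumulates |a| + |b| - |a - b|, which
   equals 2 * min(a, b) for nonnegative inputs, then halves the total.
   The VANDN trick with -0.0 computes the absolute values branch-free. */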
VL_EXPORT T
VL_XCAT(_vl_kernel_l1_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  VTYPE vminus = VL_XCAT(_mm_set1_p, VSFX) ((T) -0.0) ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE a_ = VANDN(vminus, a) ;
      VTYPE b_ = VANDN(vminus, b) ;
      VTYPE sum = VADD(a_, b_) ;
      VTYPE diff = VSUB(a, b) ;
      VTYPE diff_ = VANDN(vminus, diff) ;
      vacc = VADD(vacc, VSUB(sum, diff_)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE a_ = VANDN(vminus, a) ;
      VTYPE b_ = VANDN(vminus, b) ;
      VTYPE sum = VADD(a_, b_) ;
      VTYPE diff = VSUB(a, b) ;
      VTYPE diff_ = VANDN(vminus, diff) ;
      vacc = VADD(vacc, VSUB(sum, diff_)) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T a_ = VL_XCAT(vl_abs_, SFX) (a) ;
    T b_ = VL_XCAT(vl_abs_, SFX) (b) ;
    acc += a_ + b_ - VL_XCAT(vl_abs_, SFX) (a - b) ;
  }

  return acc / ((T)2) ;
}

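/* Chi-squared kernel: 2 * sum over i of X[i] * Y[i] / (X[i] + Y[i]),
   masking out zero-denominator terms as in the chi-squared distance. */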
VL_EXPORT T
VL_XCAT(_vl_kernel_chi2_sse2_, SFX)
(vl_size dimension, T const * X, T const * Y)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;
  T acc ;
  VTYPE vacc = VSTZ() ;
  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE denom = VADD(a, b) ;
      VTYPE numer = VMUL(a,b) ;
      VTYPE ratio = VDIV(numer, denom) ;
      ratio = VAND(ratio, VNEQ(denom, VSTZ())) ;
      vacc = VADD(vacc, ratio) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE denom = VADD(a, b) ;
      VTYPE numer = VMUL(a,b) ;
      VTYPE ratio = VDIV(numer, denom) ;
      ratio = VAND(ratio, VNEQ(denom, VSTZ())) ;
      vacc = VADD(vacc, ratio) ;
      X += VSIZE ;
      Y += VSIZE ;
    }
  }

  acc = VL_XCAT(_vl_vhsum_sse2_, SFX)(vacc) ;

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T denom = a + b ;
    if (denom) {
      T ratio = a * b / denom ;
      acc += ratio ;
    }
  }
  return ((T)2) * acc ;
}

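/* Accumulate weighted squared differences into S:
   S[i] += W * (X[i] - Y[i])^2, as when estimating a diagonal
   covariance. VLD1 broadcasts the scalar weight to every lane. */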
VL_EXPORT void
VL_XCAT(_vl_weighted_sigma_sse2_, SFX)
(vl_size dimension, T * S, T const * X, T const * Y, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;

  vl_bool dataAligned = VALIGNED(X) & VALIGNED(Y) & VALIGNED(S) ;

  VTYPE w = VLD1 (&W) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE b = *(VTYPE*)Y ;
      VTYPE s = *(VTYPE*)S ;

      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      VTYPE delta2w = VMUL(delta2, w) ;
      VTYPE sigmaStore = VADD(s, delta2w) ;

      *(VTYPE *)S = sigmaStore ;

      X += VSIZE ;
      Y += VSIZE ;
      S += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE b = VLDU(Y) ;
      VTYPE s = VLDU(S) ;

      VTYPE delta = VSUB(a, b) ;
      VTYPE delta2 = VMUL(delta, delta) ;
      VTYPE delta2w = VMUL(delta2, w) ;
      VTYPE sigmaStore = VADD(s, delta2w) ;

      VST2U(S, sigmaStore) ;

      X += VSIZE ;
      Y += VSIZE ;
      S += VSIZE ;
    }
  }

  while (X < X_end) {
    T a = *X++ ;
    T b = *Y++ ;
    T delta = a - b ;
    *S += ((delta * delta) * W) ;
    S++ ;
  }
}

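/* Accumulate a weighted sample into the mean accumulator:
   MU[i] += W * X[i]. */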
VL_EXPORT void
VL_XCAT(_vl_weighted_mean_sse2_, SFX)
(vl_size dimension, T * MU, T const * X, T const W)
{
  T const * X_end = X + dimension ;
  T const * X_vec_end = X_end - VSIZE + 1 ;

  vl_bool dataAligned = VALIGNED(X) & VALIGNED(MU) ;
  VTYPE w = VLD1 (&W) ;

  if (dataAligned) {
    while (X < X_vec_end) {
      VTYPE a = *(VTYPE*)X ;
      VTYPE mu = *(VTYPE*)MU ;

      VTYPE aw = VMUL(a, w) ;
      VTYPE meanStore = VADD(aw, mu) ;

      *(VTYPE *)MU = meanStore ;

      X += VSIZE ;
      MU += VSIZE ;
    }
  } else {
    while (X < X_vec_end) {
      VTYPE a = VLDU(X) ;
      VTYPE mu = VLDU(MU) ;

      VTYPE aw = VMUL(a, w) ;
      VTYPE meanStore = VADD(aw, mu) ;

      VST2U(MU, meanStore) ;

      X += VSIZE ;
      MU += VSIZE ;
    }
  }

  while (X < X_end) {
    T a = *X++ ;
    *MU += a * W ;
    MU++ ;
  }
}

#endif /* !VL_DISABLE_SSE2 */
#undef VL_MATHOP_SSE2_INSTANTIATING
#endif /* VL_MATHOP_SSE2_INSTANTIATING */