00001 #include <typeinfo>
00002 #include <iostream>
00003 #include <Eigen/Core>
00004 #include "BenchTimer.h"
00005 using namespace Eigen;
00006 using namespace std;
00007
00008 template<typename T>
00009 EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v)
00010 {
00011 return v.norm();
00012 }
00013
00014 template<typename T>
00015 EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v)
00016 {
00017 return v.hypotNorm();
00018 }
00019
00020 template<typename T>
00021 EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v)
00022 {
00023 return v.blueNorm();
00024 }
00025
00026 template<typename T>
00027 EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v)
00028 {
00029 typedef typename T::Scalar Scalar;
00030 int n = v.size();
00031 Scalar scale = 0;
00032 Scalar ssq = 1;
00033 for (int i=0;i<n;++i)
00034 {
00035 Scalar ax = internal::abs(v.coeff(i));
00036 if (scale >= ax)
00037 {
00038 ssq += internal::abs2(ax/scale);
00039 }
00040 else
00041 {
00042 ssq = Scalar(1) + ssq * internal::abs2(scale/ax);
00043 scale = ax;
00044 }
00045 }
00046 return scale * internal::sqrt(ssq);
00047 }
00048
00049 template<typename T>
00050 EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v)
00051 {
00052 typedef typename T::Scalar Scalar;
00053 Scalar s = v.cwise().abs().maxCoeff();
00054 return s*(v/s).norm();
00055 }
00056
00057 template<typename T>
00058 EIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v)
00059 {
00060 return v.stableNorm();
00061 }
00062
00063 template<typename T>
00064 EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v)
00065 {
00066 int n =v.size() / 2;
00067 for (int i=0;i<n;++i)
00068 v(i) = v(2*i)*v(2*i) + v(2*i+1)*v(2*i+1);
00069 n = n/2;
00070 while (n>0)
00071 {
00072 for (int i=0;i<n;++i)
00073 v(i) = v(2*i) + v(2*i+1);
00074 n = n/2;
00075 }
00076 return internal::sqrt(v(0));
00077 }
00078
00079 #ifdef EIGEN_VECTORIZE
00080 Packet4f internal::plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
00081 Packet2d internal::plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
00082
00083 Packet4f internal::pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
00084 Packet2d internal::pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
00085 #endif
00086
00087 template<typename T>
00088 EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
00089 {
00090 #ifndef EIGEN_VECTORIZE
00091 return v.blueNorm();
00092 #else
00093 typedef typename T::Scalar Scalar;
00094
00095 static int nmax = 0;
00096 static Scalar b1, b2, s1m, s2m, overfl, rbig, relerr;
00097 int n;
00098
00099 if(nmax <= 0)
00100 {
00101 int nbig, ibeta, it, iemin, iemax, iexp;
00102 Scalar abig, eps;
00103
00104 nbig = std::numeric_limits<int>::max();
00105 ibeta = std::numeric_limits<Scalar>::radix;
00106 it = std::numeric_limits<Scalar>::digits;
00107 iemin = std::numeric_limits<Scalar>::min_exponent;
00108 iemax = std::numeric_limits<Scalar>::max_exponent;
00109 rbig = std::numeric_limits<Scalar>::max();
00110
00111
00112 if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
00113 || (it<=4 && ibeta <= 3 ) || it<2)
00114 {
00115 eigen_assert(false && "the algorithm cannot be guaranteed on this computer");
00116 }
00117 iexp = -((1-iemin)/2);
00118 b1 = std::pow(ibeta, iexp);
00119 iexp = (iemax + 1 - it)/2;
00120 b2 = std::pow(ibeta,iexp);
00121
00122 iexp = (2-iemin)/2;
00123 s1m = std::pow(ibeta,iexp);
00124 iexp = - ((iemax+it)/2);
00125 s2m = std::pow(ibeta,iexp);
00126
00127 overfl = rbig*s2m;
00128 eps = std::pow(ibeta, 1-it);
00129 relerr = internal::sqrt(eps);
00130 abig = 1.0/eps - 1.0;
00131 if (Scalar(nbig)>abig) nmax = abig;
00132 else nmax = nbig;
00133 }
00134
00135 typedef typename internal::packet_traits<Scalar>::type Packet;
00136 const int ps = internal::packet_traits<Scalar>::size;
00137 Packet pasml = internal::pset1(Scalar(0));
00138 Packet pamed = internal::pset1(Scalar(0));
00139 Packet pabig = internal::pset1(Scalar(0));
00140 Packet ps2m = internal::pset1(s2m);
00141 Packet ps1m = internal::pset1(s1m);
00142 Packet pb2 = internal::pset1(b2);
00143 Packet pb1 = internal::pset1(b1);
00144 for(int j=0; j<v.size(); j+=ps)
00145 {
00146 Packet ax = internal::pabs(v.template packet<Aligned>(j));
00147 Packet ax_s2m = internal::pmul(ax,ps2m);
00148 Packet ax_s1m = internal::pmul(ax,ps1m);
00149 Packet maskBig = internal::plt(pb2,ax);
00150 Packet maskSml = internal::plt(ax,pb1);
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164 pabig = internal::padd(pabig, internal::pand(maskBig, internal::pmul(ax_s2m,ax_s2m)));
00165 pasml = internal::padd(pasml, internal::pand(maskSml, internal::pmul(ax_s1m,ax_s1m)));
00166 pamed = internal::padd(pamed, internal::pandnot(internal::pmul(ax,ax),internal::pand(maskSml,maskBig)));
00167 }
00168 Scalar abig = internal::predux(pabig);
00169 Scalar asml = internal::predux(pasml);
00170 Scalar amed = internal::predux(pamed);
00171 if(abig > Scalar(0))
00172 {
00173 abig = internal::sqrt(abig);
00174 if(abig > overfl)
00175 {
00176 eigen_assert(false && "overflow");
00177 return rbig;
00178 }
00179 if(amed > Scalar(0))
00180 {
00181 abig = abig/s2m;
00182 amed = internal::sqrt(amed);
00183 }
00184 else
00185 {
00186 return abig/s2m;
00187 }
00188
00189 }
00190 else if(asml > Scalar(0))
00191 {
00192 if (amed > Scalar(0))
00193 {
00194 abig = internal::sqrt(amed);
00195 amed = internal::sqrt(asml) / s1m;
00196 }
00197 else
00198 {
00199 return internal::sqrt(asml)/s1m;
00200 }
00201 }
00202 else
00203 {
00204 return internal::sqrt(amed);
00205 }
00206 asml = std::min(abig, amed);
00207 abig = std::max(abig, amed);
00208 if(asml <= abig*relerr)
00209 return abig;
00210 else
00211 return abig * internal::sqrt(Scalar(1) + internal::abs2(asml/abig));
00212 #endif
00213 }
00214
00215 #define BENCH_PERF(NRM) { \
00216 Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
00217 for (int k=0; k<tries; ++k) { \
00218 tf.start(); \
00219 for (int i=0; i<iters; ++i) NRM(vf); \
00220 tf.stop(); \
00221 } \
00222 for (int k=0; k<tries; ++k) { \
00223 td.start(); \
00224 for (int i=0; i<iters; ++i) NRM(vd); \
00225 td.stop(); \
00226 } \
00227 for (int k=0; k<std::max(1,tries/3); ++k) { \
00228 tcf.start(); \
00229 for (int i=0; i<iters; ++i) NRM(vcf); \
00230 tcf.stop(); \
00231 } \
00232 std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \
00233 }
00234
00235 void check_accuracy(double basef, double based, int s)
00236 {
00237 double yf = basef * internal::abs(internal::random<double>());
00238 double yd = based * internal::abs(internal::random<double>());
00239 VectorXf vf = VectorXf::Ones(s) * yf;
00240 VectorXd vd = VectorXd::Ones(s) * yd;
00241
00242 std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
00243 std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n";
00244 std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n";
00245 std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n";
00246 std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\n";
00247 std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\n";
00248 std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\n";
00249 std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\n";
00250 }
00251
00252 void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s)
00253 {
00254 VectorXf vf(s);
00255 VectorXd vd(s);
00256 for (int i=0; i<s; ++i)
00257 {
00258 vf[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
00259 vd[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
00260 }
00261
00262
00263 std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\t" << sqsumNorm(vf.cast<long double>()) << "\t" << sqsumNorm(vd.cast<long double>()) << "\n";
00264 std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\t" << hypotNorm(vf.cast<long double>()) << "\t" << hypotNorm(vd.cast<long double>()) << "\n";
00265 std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
00266 std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
00267 std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast<long double>()) << "\t" << lapackNorm(vd.cast<long double>()) << "\n";
00268 std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast<long double>()) << "\t" << twopassNorm(vd.cast<long double>()) << "\n";
00269
00270 }
00271
00272 int main(int argc, char** argv)
00273 {
00274 int tries = 10;
00275 int iters = 100000;
00276 double y = 1.1345743233455785456788e12 * internal::random<double>();
00277 VectorXf v = VectorXf::Ones(1024) * y;
00278
00279
00280 int s = 10000;
00281 double basef_ok = 1.1345743233455785456788e15;
00282 double based_ok = 1.1345743233455785456788e95;
00283
00284 double basef_under = 1.1345743233455785456788e-27;
00285 double based_under = 1.1345743233455785456788e-303;
00286
00287 double basef_over = 1.1345743233455785456788e+27;
00288 double based_over = 1.1345743233455785456788e+302;
00289
00290 std::cout.precision(20);
00291
00292 std::cerr << "\nNo under/overflow:\n";
00293 check_accuracy(basef_ok, based_ok, s);
00294
00295 std::cerr << "\nUnderflow:\n";
00296 check_accuracy(basef_under, based_under, s);
00297
00298 std::cerr << "\nOverflow:\n";
00299 check_accuracy(basef_over, based_over, s);
00300
00301 std::cerr << "\nVarying (over):\n";
00302 for (int k=0; k<1; ++k)
00303 {
00304 check_accuracy_var(20,27,190,302,s);
00305 std::cout << "\n";
00306 }
00307
00308 std::cerr << "\nVarying (under):\n";
00309 for (int k=0; k<1; ++k)
00310 {
00311 check_accuracy_var(-27,20,-302,-190,s);
00312 std::cout << "\n";
00313 }
00314
00315 std::cout.precision(4);
00316 std::cerr << "Performance (out of cache):\n";
00317 {
00318 int iters = 1;
00319 VectorXf vf = VectorXf::Random(1024*1024*32) * y;
00320 VectorXd vd = VectorXd::Random(1024*1024*32) * y;
00321 VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y;
00322 BENCH_PERF(sqsumNorm);
00323 BENCH_PERF(blueNorm);
00324
00325
00326
00327
00328 BENCH_PERF(bl2passNorm);
00329 }
00330
00331 std::cerr << "\nPerformance (in cache):\n";
00332 {
00333 int iters = 100000;
00334 VectorXf vf = VectorXf::Random(512) * y;
00335 VectorXd vd = VectorXd::Random(512) * y;
00336 VectorXcf vcf = VectorXcf::Random(512) * y;
00337 BENCH_PERF(sqsumNorm);
00338 BENCH_PERF(blueNorm);
00339
00340
00341
00342
00343 BENCH_PERF(bl2passNorm);
00344 }
00345 }