20 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size 21 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k 22 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m 23 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n 28 using namespace Eigen;
56 static_assert((
maxsize & (
maxsize - 1)) == 0,
"maxsize must be a power of two");
57 static_assert((
minsize & (
minsize - 1)) == 0,
"minsize must be a power of two");
58 static_assert(
maxsize >
minsize,
"maxsize must be larger than minsize");
59 static_assert(
maxsize < (
minsize << 16),
"maxsize must be less than (minsize<<16)");
70 k = 1 << ((compact & 0xf00) >> 8);
71 m = 1 << ((compact & 0x0f0) >> 4);
72 n = 1 << ((compact & 0x00f) >> 0);
104 : compact_product_size(0)
105 , compact_block_size(0)
106 , use_default_block_size(false)
111 size_t bk,
size_t bm,
size_t bn)
114 , use_default_block_size(false)
119 , compact_block_size(0)
120 , use_default_block_size(true)
133 internal::computeProductBlockingSizes<Scalar, Scalar>(k,
m,
n);
134 s <<
" default(" << k <<
", " << m <<
", " <<
n <<
")";
157 if (use_default_block_size) {
170 const size_t combined_three_matrices_sizes =
172 (productsizes.
k * productsizes.
m +
173 productsizes.
k * productsizes.
n +
174 productsizes.
m * productsizes.
n);
179 const size_t unlikely_large_cache_size = 64 << 20;
181 const size_t working_set_size =
184 const size_t matrix_pool_size =
185 1 + working_set_size / combined_three_matrices_sizes;
191 for (
size_t i = 0;
i < matrix_pool_size;
i++) {
192 lhs[
i] = MatrixType::Zero(productsizes.
m, productsizes.
k);
193 rhs[
i] = MatrixType::Zero(productsizes.
k, productsizes.
n);
194 dst[
i] = MatrixType::Zero(productsizes.
m, productsizes.
n);
199 int iters_at_a_time = 1;
200 float time_per_iter = 0.0f;
201 size_t matrix_index = 0;
205 for (
int i = 0;
i < iters_at_a_time;
i++) {
206 dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
208 if (matrix_index == matrix_pool_size) {
214 const float timing =
float(endtime - starttime);
217 time_per_iter = timing / iters_at_a_time;
221 iters_at_a_time *= 2;
228 gflops = 2
e-9 * productsizes.
k * productsizes.
m * productsizes.
n / time_per_iter;
234 cout <<
"contents of /proc/cpuinfo:" << endl;
236 ifstream cpuinfo(
"/proc/cpuinfo");
237 if (cpuinfo.is_open()) {
238 while (getline(cpuinfo, line)) {
239 cout << line << endl;
244 #elif defined __APPLE__ 245 cout <<
"output of sysctl hw:" << endl;
251 template <
typename T>
272 virtual void run()
const { abort(); }
277 const vector<unique_ptr<action_t>>& available_actions)
279 cerr <<
"usage: " << argv[0] <<
" <action> [options...]" << endl << endl;
280 cerr <<
"available actions:" << endl << endl;
281 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
282 cerr <<
" " << (*it)->invokation_name() << endl;
285 cerr <<
"options:" << endl << endl;
286 cerr <<
" --min-working-set-size=N:" << endl;
287 cerr <<
" Set the minimum working set size to N bytes." << endl;
288 cerr <<
" This is rounded up as needed to a multiple of matrix size." << endl;
289 cerr <<
" A larger working set lowers the chance of a warm cache." << endl;
290 cerr <<
" The default value 0 means use a large enough working" << endl;
291 cerr <<
" set to likely outsize caches." << endl;
292 cerr <<
" A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
293 cerr <<
" avoid warm caches." << endl;
299 cerr <<
"Measuring clock speed... \r" << flush;
301 vector<float> all_gflops;
302 for (
int i = 0;
i < 8;
i++) {
305 all_gflops.push_back(b.
gflops);
308 sort(all_gflops.begin(), all_gflops.end());
309 float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
313 float result = stable_estimate * 123.456f;
327 if (remainder > 3600) {
328 int hours = remainder / 3600;
330 remainder -= hours * 3600;
332 if (remainder > 60) {
333 int minutes = remainder / 60;
334 s << minutes <<
" min ";
335 remainder -= minutes * 60;
338 s << remainder <<
" s";
347 FILE*
file = fopen(filename,
"w");
349 cerr <<
"Could not open file " << filename <<
" for writing." << endl;
350 cerr <<
"Do you have write permissions on the current working directory?" << endl;
353 size_t benchmarks_vector_size = benchmarks.size();
355 fwrite(&benchmarks_vector_size,
sizeof(benchmarks_vector_size), 1, file);
356 fwrite(&first_benchmark_to_run,
sizeof(first_benchmark_to_run), 1, file);
357 fwrite(benchmarks.data(),
sizeof(
benchmark_t), benchmarks.size(),
file);
363 FILE*
file = fopen(filename,
"r");
370 size_t benchmarks_vector_size = 0;
371 if (1 != fread(&benchmarks_vector_size,
sizeof(benchmarks_vector_size), 1, file)) {
374 if (1 != fread(&first_benchmark_to_run,
sizeof(first_benchmark_to_run), 1, file)) {
377 benchmarks.resize(benchmarks_vector_size);
378 if (benchmarks.size() != fread(benchmarks.data(),
sizeof(
benchmark_t), benchmarks.size(),
file)) {
386 vector<benchmark_t>& benchmarks,
388 size_t& first_benchmark_to_run)
390 if (first_benchmark_to_run == benchmarks.size()) {
394 double time_last_progress_update = 0;
395 double time_last_clock_speed_measurement = 0;
398 size_t benchmark_index = first_benchmark_to_run;
401 float ratio_done =
float(benchmark_index) / benchmarks.size();
405 if (benchmark_index == benchmarks.size() ||
406 time_now > time_last_clock_speed_measurement + 60.0f)
408 time_last_clock_speed_measurement = time_now;
418 const float clock_speed_tolerance = 0.02f;
420 if (current_clock_speed > (1 + clock_speed_tolerance) *
max_clock_speed) {
427 if (benchmark_index) {
428 cerr <<
"Restarting at " << 100.0f * ratio_done
429 <<
" % because clock speed increased. " << endl;
431 max_clock_speed = current_clock_speed;
432 first_benchmark_to_run = 0;
436 bool rerun_last_tests =
false;
438 if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
439 cerr <<
"Measurements completed so far: " 440 << 100.0f * ratio_done
442 cerr <<
"Clock speed seems to be only " 443 << current_clock_speed/max_clock_speed
444 <<
" times what it used to be." << endl;
446 unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
448 while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
449 if (seconds_to_sleep_if_lower_clock_speed > 32) {
450 cerr <<
"Sleeping longer probably won't make a difference." << endl;
451 cerr <<
"Serializing benchmarks to " << session_filename << endl;
453 cerr <<
"Now restart this benchmark, and it should pick up where we left." << endl;
456 rerun_last_tests =
true;
458 << seconds_to_sleep_if_lower_clock_speed
459 <<
" s... \r" << endl;
460 sleep(seconds_to_sleep_if_lower_clock_speed);
462 seconds_to_sleep_if_lower_clock_speed *= 2;
466 if (rerun_last_tests) {
467 cerr <<
"Redoing the last " 468 << 100.0f *
float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
469 <<
" % because clock speed had been low. " << endl;
475 first_benchmark_to_run = benchmark_index;
478 if (benchmark_index == benchmarks.size()) {
480 first_benchmark_to_run = benchmarks.size();
487 if (time_now > time_last_progress_update + 1.0
f) {
488 time_last_progress_update = time_now;
489 cerr <<
"Measurements... " << 100.0f * ratio_done
491 <<
human_duration_t(
float(time_now - time_start) * (1.0
f - ratio_done) / ratio_done)
496 benchmarks[benchmark_index].run();
503 size_t first_benchmark_to_run;
504 vector<benchmark_t> deserialized_benchmarks;
505 bool use_deserialized_benchmarks =
false;
507 cerr <<
"Found serialized session with " 508 << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
509 <<
" % already done" << endl;
510 if (deserialized_benchmarks.size() == benchmarks.size() &&
511 first_benchmark_to_run > 0 &&
512 first_benchmark_to_run < benchmarks.size())
514 use_deserialized_benchmarks =
true;
518 if (use_deserialized_benchmarks) {
519 benchmarks = deserialized_benchmarks;
522 first_benchmark_to_run = 0;
528 random_shuffle(benchmarks.begin(), benchmarks.end());
531 for (
int i = 0;
i < 4;
i++) {
535 double time_start = 0.0;
536 while (first_benchmark_to_run < benchmarks.size()) {
537 if (first_benchmark_to_run == 0) {
542 first_benchmark_to_run);
548 sort(benchmarks.begin(), benchmarks.end());
551 vector<benchmark_t> best_benchmarks;
552 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
553 if (best_benchmarks.empty() ||
554 best_benchmarks.back().compact_product_size != it->compact_product_size ||
555 best_benchmarks.back().compact_block_size != it->compact_block_size)
557 best_benchmarks.push_back(*it);
562 benchmarks = best_benchmarks;
570 vector<benchmark_t> benchmarks;
575 for (
size_t kblock =
minsize; kblock <= ksize; kblock *= 2) {
576 for (
size_t mblock =
minsize; mblock <= msize; mblock *= 2) {
577 for (
size_t nblock =
minsize; nblock <= nsize; nblock *= 2) {
578 benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
589 cout <<
"BEGIN MEASUREMENTS ALL POT SIZES" << endl;
590 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
601 vector<benchmark_t> benchmarks;
606 benchmarks.emplace_back(ksize, msize, nsize);
614 cout <<
"BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
615 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
621 int main(
int argc,
char* argv[])
627 vector<unique_ptr<action_t>> available_actions;
631 auto action = available_actions.end();
636 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
637 if (!strcmp(argv[1], (*it)->invokation_name())) {
643 if (
action == available_actions.end()) {
647 for (
int i = 2;
i < argc;
i++) {
648 if (argv[
i] == strstr(argv[
i],
"--min-working-set-size=")) {
649 const char* equals_sign = strchr(argv[i],
'=');
652 cerr <<
"unrecognized option: " << argv[
i] << endl << endl;
659 cout <<
"benchmark parameters:" << endl;
660 cout <<
"pointer size: " << 8*
sizeof(
void*) <<
" bits" << endl;
661 cout <<
"scalar type: " << type_name<Scalar>() << endl;
663 cout <<
"minsize = " <<
minsize << endl;
664 cout <<
"maxsize = " <<
maxsize << endl;
668 if (min_working_set_size == 0) {
669 cout <<
" (try to outsize caches)";
671 cout << endl << endl;
void run_benchmarks(vector< benchmark_t > &benchmarks)
internal::packet_traits< Scalar >::type Packet
const int measurement_repetitions
const char session_filename[]
size_t min_working_set_size
virtual const char * invokation_name() const
uint16_t compact_product_size
void show_usage_and_exit(int, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
bool operator<(const benchmark_t &b1, const benchmark_t &b2)
size_triple_t(uint16_t compact)
virtual const char * invokation_name() const
size_triple_t(const size_triple_t &o)
bool deserialize_benchmarks(const char *filename, vector< benchmark_t > &benchmarks, size_t &first_benchmark_to_run)
Namespace containing all symbols from the Eigen library.
uint8_t log2_pot(size_t x)
benchmark_t(size_t pk, size_t pm, size_t pn)
double getRealTime() const
const mpreal remainder(const mpreal &x, const mpreal &y, mp_rnd_t rnd_mode=mpreal::get_default_rnd())
MatrixType::Scalar Scalar
uint16_t compact_block_size
void try_run_some_benchmarks(vector< benchmark_t > &benchmarks, double time_start, size_t &first_benchmark_to_run)
static const Line3 l(Rot3(), 1, 1)
std::ostream & operator<<(std::ostream &os, const TensorBase< T, ReadOnlyAccessors > &expr)
double getCpuTime() const
bool eigen_use_specific_block_size
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
size_triple_t(size_t _k, size_t _m, size_t _n)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
bool use_default_block_size
float measure_clock_speed()
benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn)
const float min_accurate_time
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
virtual const char * invokation_name() const
void run(Expr &expr, Dev &dev)
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
string type_name< double >()
string type_name< float >()
int main(int argc, char *argv[])
void serialize_benchmarks(const char *filename, const vector< benchmark_t > &benchmarks, size_t first_benchmark_to_run)