20 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size 21 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k 22 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m 23 #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n 28 using namespace Eigen;
56 static_assert((
maxsize & (
maxsize - 1)) == 0,
"maxsize must be a power of two");
57 static_assert((
minsize & (
minsize - 1)) == 0,
"minsize must be a power of two");
58 static_assert(
maxsize >
minsize,
"maxsize must be larger than minsize");
59 static_assert(
maxsize < (
minsize << 16),
"maxsize must be less than (minsize<<16)");
70 k = 1 << ((compact & 0xf00) >> 8);
71 m = 1 << ((compact & 0x0f0) >> 4);
72 n = 1 << ((compact & 0x00f) >> 0);
104 : compact_product_size(0)
105 , compact_block_size(0)
106 , use_default_block_size(false)
111 size_t bk,
size_t bm,
size_t bn)
114 , use_default_block_size(false)
119 , compact_block_size(0)
120 , use_default_block_size(true)
133 internal::computeProductBlockingSizes<Scalar, Scalar>(k,
m,
n);
134 s <<
" default(" << k <<
", " << m <<
", " <<
n <<
")";
157 if (use_default_block_size) {
170 const size_t combined_three_matrices_sizes =
172 (productsizes.
k * productsizes.
m +
173 productsizes.
k * productsizes.
n +
174 productsizes.
m * productsizes.
n);
179 const size_t unlikely_large_cache_size = 64 << 20;
181 const size_t working_set_size =
184 const size_t matrix_pool_size =
185 1 + working_set_size / combined_three_matrices_sizes;
191 for (
size_t i = 0;
i < matrix_pool_size;
i++) {
192 lhs[
i] = MatrixType::Zero(productsizes.
m, productsizes.
k);
193 rhs[
i] = MatrixType::Zero(productsizes.
k, productsizes.
n);
194 dst[
i] = MatrixType::Zero(productsizes.
m, productsizes.
n);
199 int iters_at_a_time = 1;
200 float time_per_iter = 0.0f;
201 size_t matrix_index = 0;
205 for (
int i = 0;
i < iters_at_a_time;
i++) {
206 dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
208 if (matrix_index == matrix_pool_size) {
214 const float timing =
float(endtime - starttime);
217 time_per_iter = timing / iters_at_a_time;
221 iters_at_a_time *= 2;
228 gflops = 2
e-9 * productsizes.
k * productsizes.
m * productsizes.
n / time_per_iter;
234 cout <<
"contents of /proc/cpuinfo:" << endl;
236 ifstream cpuinfo(
"/proc/cpuinfo");
237 if (cpuinfo.is_open()) {
238 while (getline(cpuinfo, line)) {
239 cout << line << endl;
244 #elif defined __APPLE__ 245 cout <<
"output of sysctl hw:" << endl;
251 template <
typename T>
272 virtual void run()
const { abort(); }
277 const vector<unique_ptr<action_t>>& available_actions)
279 cerr <<
"usage: " << argv[0] <<
" <action> [options...]" << endl << endl;
280 cerr <<
"available actions:" << endl << endl;
281 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
282 cerr <<
" " << (*it)->invokation_name() << endl;
285 cerr <<
"options:" << endl << endl;
286 cerr <<
" --min-working-set-size=N:" << endl;
287 cerr <<
" Set the minimum working set size to N bytes." << endl;
288 cerr <<
" This is rounded up as needed to a multiple of matrix size." << endl;
289 cerr <<
" A larger working set lowers the chance of a warm cache." << endl;
290 cerr <<
" The default value 0 means use a large enough working" << endl;
291 cerr <<
" set to likely outsize caches." << endl;
292 cerr <<
" A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
293 cerr <<
" avoid warm caches." << endl;
299 cerr <<
"Measuring clock speed... \r" << flush;
301 vector<float> all_gflops;
302 for (
int i = 0;
i < 8;
i++) {
305 all_gflops.push_back(b.
gflops);
308 sort(all_gflops.begin(), all_gflops.end());
309 float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
313 float result = stable_estimate * 123.456f;
327 if (remainder > 3600) {
328 int hours = remainder / 3600;
330 remainder -= hours * 3600;
332 if (remainder > 60) {
333 int minutes = remainder / 60;
334 s << minutes <<
" min ";
335 remainder -= minutes * 60;
338 s << remainder <<
" s";
347 FILE*
file = fopen(filename,
"w");
349 cerr <<
"Could not open file " << filename <<
" for writing." << endl;
350 cerr <<
"Do you have write permissions on the current working directory?" << endl;
353 size_t benchmarks_vector_size = benchmarks.size();
355 fwrite(&benchmarks_vector_size,
sizeof(benchmarks_vector_size), 1, file);
356 fwrite(&first_benchmark_to_run,
sizeof(first_benchmark_to_run), 1, file);
357 fwrite(benchmarks.data(),
sizeof(
benchmark_t), benchmarks.size(),
file);
363 FILE*
file = fopen(filename,
"r");
370 size_t benchmarks_vector_size = 0;
371 if (1 != fread(&benchmarks_vector_size,
sizeof(benchmarks_vector_size), 1, file)) {
374 if (1 != fread(&first_benchmark_to_run,
sizeof(first_benchmark_to_run), 1, file)) {
377 benchmarks.resize(benchmarks_vector_size);
378 if (benchmarks.size() != fread(benchmarks.data(),
sizeof(
benchmark_t), benchmarks.size(),
file)) {
386 vector<benchmark_t>& benchmarks,
388 size_t& first_benchmark_to_run)
390 if (first_benchmark_to_run == benchmarks.size()) {
394 double time_last_progress_update = 0;
395 double time_last_clock_speed_measurement = 0;
398 size_t benchmark_index = first_benchmark_to_run;
401 float ratio_done =
float(benchmark_index) / benchmarks.size();
405 if (benchmark_index == benchmarks.size() ||
406 time_now > time_last_clock_speed_measurement + 60.0f)
408 time_last_clock_speed_measurement = time_now;
418 const float clock_speed_tolerance = 0.02f;
420 if (current_clock_speed > (1 + clock_speed_tolerance) *
max_clock_speed) {
427 if (benchmark_index) {
428 cerr <<
"Restarting at " << 100.0f * ratio_done
429 <<
" % because clock speed increased. " << endl;
431 max_clock_speed = current_clock_speed;
432 first_benchmark_to_run = 0;
436 bool rerun_last_tests =
false;
438 if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
439 cerr <<
"Measurements completed so far: " 440 << 100.0f * ratio_done
442 cerr <<
"Clock speed seems to be only " 443 << current_clock_speed/max_clock_speed
444 <<
" times what it used to be." << endl;
446 unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
448 while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
449 if (seconds_to_sleep_if_lower_clock_speed > 32) {
450 cerr <<
"Sleeping longer probably won't make a difference." << endl;
451 cerr <<
"Serializing benchmarks to " << session_filename << endl;
453 cerr <<
"Now restart this benchmark, and it should pick up where we left." << endl;
456 rerun_last_tests =
true;
458 << seconds_to_sleep_if_lower_clock_speed
459 <<
" s... \r" << endl;
460 sleep(seconds_to_sleep_if_lower_clock_speed);
462 seconds_to_sleep_if_lower_clock_speed *= 2;
466 if (rerun_last_tests) {
467 cerr <<
"Redoing the last " 468 << 100.0f *
float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
469 <<
" % because clock speed had been low. " << endl;
475 first_benchmark_to_run = benchmark_index;
478 if (benchmark_index == benchmarks.size()) {
480 first_benchmark_to_run = benchmarks.size();
487 if (time_now > time_last_progress_update + 1.0
f) {
488 time_last_progress_update = time_now;
489 cerr <<
"Measurements... " << 100.0f * ratio_done
491 <<
human_duration_t(
float(time_now - time_start) * (1.0
f - ratio_done) / ratio_done)
496 benchmarks[benchmark_index].run();
503 size_t first_benchmark_to_run;
504 vector<benchmark_t> deserialized_benchmarks;
505 bool use_deserialized_benchmarks =
false;
507 cerr <<
"Found serialized session with " 508 << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
509 <<
" % already done" << endl;
510 if (deserialized_benchmarks.size() == benchmarks.size() &&
511 first_benchmark_to_run > 0 &&
512 first_benchmark_to_run < benchmarks.size())
514 use_deserialized_benchmarks =
true;
518 if (use_deserialized_benchmarks) {
519 benchmarks = deserialized_benchmarks;
522 first_benchmark_to_run = 0;
528 random_shuffle(benchmarks.begin(), benchmarks.end());
531 for (
int i = 0;
i < 4;
i++) {
535 double time_start = 0.0;
536 while (first_benchmark_to_run < benchmarks.size()) {
537 if (first_benchmark_to_run == 0) {
542 first_benchmark_to_run);
548 sort(benchmarks.begin(), benchmarks.end());
551 vector<benchmark_t> best_benchmarks;
552 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
553 if (best_benchmarks.empty() ||
554 best_benchmarks.back().compact_product_size != it->compact_product_size ||
555 best_benchmarks.back().compact_block_size != it->compact_block_size)
557 best_benchmarks.push_back(*it);
562 benchmarks = best_benchmarks;
570 vector<benchmark_t> benchmarks;
575 for (
size_t kblock =
minsize; kblock <= ksize; kblock *= 2) {
576 for (
size_t mblock =
minsize; mblock <= msize; mblock *= 2) {
577 for (
size_t nblock =
minsize; nblock <= nsize; nblock *= 2) {
578 benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
589 cout <<
"BEGIN MEASUREMENTS ALL POT SIZES" << endl;
590 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
601 vector<benchmark_t> benchmarks;
606 benchmarks.emplace_back(ksize, msize, nsize);
614 cout <<
"BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
615 for (
auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
621 int main(
int argc,
char* argv[])
627 vector<unique_ptr<action_t>> available_actions;
631 auto action = available_actions.end();
636 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
637 if (!strcmp(argv[1], (*it)->invokation_name())) {
643 if (
action == available_actions.end()) {
647 for (
int i = 2;
i < argc;
i++) {
648 if (argv[
i] == strstr(argv[
i],
"--min-working-set-size=")) {
649 const char* equals_sign = strchr(argv[i],
'=');
652 cerr <<
"unrecognized option: " << argv[
i] << endl << endl;
659 cout <<
"benchmark parameters:" << endl;
660 cout <<
"pointer size: " << 8*
sizeof(
void*) <<
" bits" << endl;
661 cout <<
"scalar type: " << type_name<Scalar>() << endl;
663 cout <<
"minsize = " <<
minsize << endl;
664 cout <<
"maxsize = " <<
maxsize << endl;
668 if (min_working_set_size == 0) {
669 cout <<
" (try to outsize caches)";
671 cout << endl << endl;
void run_benchmarks(vector< benchmark_t > &benchmarks)
internal::packet_traits< Scalar >::type Packet
const int measurement_repetitions
const char session_filename[]
size_t min_working_set_size
uint16_t compact_product_size
void show_usage_and_exit(int, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
double getCpuTime() const
bool operator<(const benchmark_t &b1, const benchmark_t &b2)
size_triple_t(uint16_t compact)
size_triple_t(const size_triple_t &o)
bool deserialize_benchmarks(const char *filename, vector< benchmark_t > &benchmarks, size_t &first_benchmark_to_run)
Namespace containing all symbols from the Eigen library.
CleanedUpDerType< DerType >::type() max(const AutoDiffScalar< DerType > &x, const T &y)
uint8_t log2_pot(size_t x)
benchmark_t(size_t pk, size_t pm, size_t pn)
virtual const char * invokation_name() const
MatrixType::Scalar Scalar
uint16_t compact_block_size
virtual const char * invokation_name() const
double getRealTime() const
void try_run_some_benchmarks(vector< benchmark_t > &benchmarks, double time_start, size_t &first_benchmark_to_run)
static const Line3 l(Rot3(), 1, 1)
bool eigen_use_specific_block_size
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
size_triple_t(size_t _k, size_t _m, size_t _n)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
bool use_default_block_size
float measure_clock_speed()
benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn)
const float min_accurate_time
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
string type_name< double >()
string type_name< float >()
std::ostream & operator<<(std::ostream &os, const DSizes< IndexType, NumDims > &dims)
int main(int argc, char *argv[])
virtual const char * invokation_name() const
void serialize_benchmarks(const char *filename, const vector< benchmark_t > &benchmarks, size_t first_benchmark_to_run)