analyze-blocking-sizes.cpp
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #include <iostream>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <vector>
14 #include <algorithm>
15 #include <fstream>
16 #include <string>
17 #include <cmath>
18 #include <cassert>
19 #include <cstring>
20 #include <memory>
21 
22 #include <Eigen/Core>
23 
24 using namespace std;
25 
26 const int default_precision = 4;
27 
28 // see --only-cubic-sizes
29 bool only_cubic_sizes = false;
30 
31 // see --dump-tables
32 bool dump_tables = false;
33 
34 uint8_t log2_pot(size_t x) {
35  size_t l = 0;
36  while (x >>= 1) l++;
37  return l;
38 }
39 
40 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
41 {
42  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
43 }
44 
45 // just a helper to store a triple of K,M,N sizes for matrix product
47 {
48  uint16_t k, m, n;
49  size_triple_t() : k(0), m(0), n(0) {}
50  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
51  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
53  {
54  k = 1 << ((compact & 0xf00) >> 8);
55  m = 1 << ((compact & 0x0f0) >> 4);
56  n = 1 << ((compact & 0x00f) >> 0);
57  }
58  bool is_cubic() const { return k == m && m == n; }
59 };
60 
61 ostream& operator<<(ostream& s, const size_triple_t& t)
62 {
63  return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
64 }
65 
67 {
71  float gflops;
72 };
73 
75 {
76  enum class type_t {
77  unknown,
78  all_pot_sizes,
79  default_sizes
80  };
81 
82  string filename;
83  vector<inputfile_entry_t> entries;
85 
86  inputfile_t(const string& fname)
87  : filename(fname)
88  , type(type_t::unknown)
89  {
90  ifstream stream(filename);
91  if (!stream.is_open()) {
92  cerr << "couldn't open input file: " << filename << endl;
93  exit(1);
94  }
95  string line;
96  while (getline(stream, line)) {
97  if (line.empty()) continue;
98  if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
99  if (type != type_t::unknown) {
100  cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
101  exit(1);
102  }
103  type = type_t::all_pot_sizes;
104  continue;
105  }
106  if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
107  if (type != type_t::unknown) {
108  cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
109  exit(1);
110  }
111  type = type_t::default_sizes;
112  continue;
113  }
114 
115 
116  if (type == type_t::unknown) {
117  continue;
118  }
119  switch(type) {
120  case type_t::all_pot_sizes: {
121  unsigned int product_size, block_size;
122  float gflops;
123  int sscanf_result =
124  sscanf(line.c_str(), "%x %x %f",
125  &product_size,
126  &block_size,
127  &gflops);
128  if (3 != sscanf_result ||
129  !product_size ||
130  product_size > 0xfff ||
131  !block_size ||
132  block_size > 0xfff ||
133  !isfinite(gflops))
134  {
135  cerr << "ill-formed input file: " << filename << endl;
136  cerr << "offending line:" << endl << line << endl;
137  exit(1);
138  }
139  if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
140  continue;
141  }
142  inputfile_entry_t entry;
143  entry.product_size = uint16_t(product_size);
144  entry.pot_block_size = uint16_t(block_size);
145  entry.gflops = gflops;
146  entries.push_back(entry);
147  break;
148  }
149  case type_t::default_sizes: {
150  unsigned int product_size;
151  float gflops;
152  int bk, bm, bn;
153  int sscanf_result =
154  sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
155  &product_size,
156  &bk, &bm, &bn,
157  &gflops);
158  if (5 != sscanf_result ||
159  !product_size ||
160  product_size > 0xfff ||
161  !isfinite(gflops))
162  {
163  cerr << "ill-formed input file: " << filename << endl;
164  cerr << "offending line:" << endl << line << endl;
165  exit(1);
166  }
167  if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
168  continue;
169  }
170  inputfile_entry_t entry;
171  entry.product_size = uint16_t(product_size);
172  entry.pot_block_size = 0;
173  entry.nonpot_block_size = size_triple_t(bk, bm, bn);
174  entry.gflops = gflops;
175  entries.push_back(entry);
176  break;
177  }
178 
179  default:
180  break;
181  }
182  }
183  stream.close();
184  if (type == type_t::unknown) {
185  cerr << "Unrecognized input file " << filename << endl;
186  exit(1);
187  }
188  if (entries.empty()) {
189  cerr << "didn't find any measurements in input file: " << filename << endl;
190  exit(1);
191  }
192  }
193 };
194 
196 {
199 
200  float efficiency;
201 };
202 
204 {
205  return e1.efficiency < e2.efficiency;
206 }
207 
209 {
210  string filename;
211  vector<preprocessed_inputfile_entry_t> entries;
212 
214  : filename(inputfile.filename)
215  {
216  if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
217  abort();
218  }
219  auto it = inputfile.entries.begin();
220  auto it_first_with_given_product_size = it;
221  while (it != inputfile.entries.end()) {
222  ++it;
223  if (it == inputfile.entries.end() ||
224  it->product_size != it_first_with_given_product_size->product_size)
225  {
226  import_input_file_range_one_product_size(it_first_with_given_product_size, it);
227  it_first_with_given_product_size = it;
228  }
229  }
230  }
231 
232 private:
234  const vector<inputfile_entry_t>::const_iterator& begin,
235  const vector<inputfile_entry_t>::const_iterator& end)
236  {
237  uint16_t product_size = begin->product_size;
238  float max_gflops = 0.0f;
239  for (auto it = begin; it != end; ++it) {
240  if (it->product_size != product_size) {
241  cerr << "Unexpected ordering of entries in " << filename << endl;
242  cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
243  exit(1);
244  }
245  max_gflops = max(max_gflops, it->gflops);
246  }
247  for (auto it = begin; it != end; ++it) {
249  entry.product_size = it->product_size;
250  entry.block_size = it->pot_block_size;
251  entry.efficiency = it->gflops / max_gflops;
252  entries.push_back(entry);
253  }
254  }
255 };
256 
258  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
259 {
260  if (preprocessed_inputfiles.empty()) {
261  return;
262  }
263 
264  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
265  const size_t num_entries = first_file.entries.size();
266 
267  for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
268  if (preprocessed_inputfiles[i].entries.size() != num_entries) {
269  cerr << "these files have different number of entries: "
270  << preprocessed_inputfiles[i].filename
271  << " and "
272  << first_file.filename
273  << endl;
274  exit(1);
275  }
276  }
277 
278  for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
279  const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
280  const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
281  for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
282  const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
283  if (cur_file.entries[entry_index].product_size != entry_product_size ||
284  cur_file.entries[entry_index].block_size != entry_block_size)
285  {
286  cerr << "entries not in same order between these files: "
287  << first_file.filename
288  << " and "
289  << cur_file.filename
290  << endl;
291  exit(1);
292  }
293  }
294  }
295 }
296 
298  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
299  const vector<size_t>& subset)
300 {
301  if (subset.size() <= 1) {
302  return 1.0f;
303  }
304  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
305  const size_t num_entries = first_file.entries.size();
306  float efficiency = 1.0f;
307  size_t entry_index = 0;
308  size_t first_entry_index_with_this_product_size = 0;
309  uint16_t product_size = first_file.entries[0].product_size;
310  while (entry_index < num_entries) {
311  ++entry_index;
312  if (entry_index == num_entries ||
313  first_file.entries[entry_index].product_size != product_size)
314  {
315  float efficiency_this_product_size = 0.0f;
316  for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
317  float efficiency_this_entry = 1.0f;
318  for (auto i = subset.begin(); i != subset.end(); ++i) {
319  efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
320  }
321  efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
322  }
323  efficiency = min(efficiency, efficiency_this_product_size);
324  if (entry_index < num_entries) {
325  first_entry_index_with_this_product_size = entry_index;
326  product_size = first_file.entries[entry_index].product_size;
327  }
328  }
329  }
330 
331  return efficiency;
332 }
333 
335  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
336  const vector<size_t>& subset)
337 {
338  const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
339  const size_t num_entries = first_file.entries.size();
340  size_t entry_index = 0;
341  size_t first_entry_index_with_this_product_size = 0;
342  uint16_t product_size = first_file.entries[0].product_size;
343  size_t i = 0;
344  size_triple_t min_product_size(first_file.entries.front().product_size);
345  size_triple_t max_product_size(first_file.entries.back().product_size);
346  if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
347  abort();
348  }
349  if (only_cubic_sizes) {
350  cerr << "Can't generate tables with --only-cubic-sizes." << endl;
351  abort();
352  }
353  cout << "struct LookupTable {" << endl;
354  cout << " static const size_t BaseSize = " << min_product_size.k << ";" << endl;
355  const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
356  const size_t TableSize = NumSizes * NumSizes * NumSizes;
357  cout << " static const size_t NumSizes = " << NumSizes << ";" << endl;
358  cout << " static const unsigned short* Data() {" << endl;
359  cout << " static const unsigned short data[" << TableSize << "] = {";
360  while (entry_index < num_entries) {
361  ++entry_index;
362  if (entry_index == num_entries ||
363  first_file.entries[entry_index].product_size != product_size)
364  {
365  float best_efficiency_this_product_size = 0.0f;
366  uint16_t best_block_size_this_product_size = 0;
367  for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
368  float efficiency_this_entry = 1.0f;
369  for (auto i = subset.begin(); i != subset.end(); ++i) {
370  efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
371  }
372  if (efficiency_this_entry > best_efficiency_this_product_size) {
373  best_efficiency_this_product_size = efficiency_this_entry;
374  best_block_size_this_product_size = first_file.entries[e].block_size;
375  }
376  }
377  if ((i++) % NumSizes) {
378  cout << " ";
379  } else {
380  cout << endl << " ";
381  }
382  cout << "0x" << hex << best_block_size_this_product_size << dec;
383  if (entry_index < num_entries) {
384  cout << ",";
385  first_entry_index_with_this_product_size = entry_index;
386  product_size = first_file.entries[entry_index].product_size;
387  }
388  }
389  }
390  if (i != TableSize) {
391  cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
392  abort();
393  }
394  cout << endl << " };" << endl;
395  cout << " return data;" << endl;
396  cout << " }" << endl;
397  cout << "};" << endl;
398 }
399 
401  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
402  const vector<vector<size_t>>& partition)
403 {
404  float efficiency = 1.0f;
405  for (auto s = partition.begin(); s != partition.end(); ++s) {
406  efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
407  }
408  return efficiency;
409 }
410 
411 void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
412 {
413  assert(subset_size >= 1 && subset_size <= set_size);
414  out_subset.resize(subset_size);
415  for (size_t i = 0; i < subset_size; i++) {
416  out_subset[i] = i;
417  }
418 }
419 
420 bool is_last_subset(const vector<size_t>& subset, size_t set_size)
421 {
422  return subset[0] == set_size - subset.size();
423 }
424 
425 void next_subset(vector<size_t>& inout_subset, size_t set_size)
426 {
427  if (is_last_subset(inout_subset, set_size)) {
428  cerr << "iterating past the last subset" << endl;
429  abort();
430  }
431  size_t i = 1;
432  while (inout_subset[inout_subset.size() - i] == set_size - i) {
433  i++;
434  assert(i <= inout_subset.size());
435  }
436  size_t first_index_to_change = inout_subset.size() - i;
437  inout_subset[first_index_to_change]++;
438  size_t p = inout_subset[first_index_to_change];
439  for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
440  inout_subset[j] = ++p;
441  }
442 }
443 
444 const size_t number_of_subsets_limit = 100;
446 
447 bool is_number_of_subsets_feasible(size_t n, size_t p)
448 {
449  assert(n>0 && p>0 && p<=n);
450  uint64_t numerator = 1, denominator = 1;
451  for (size_t i = 0; i < p; i++) {
452  numerator *= n - i;
453  denominator *= i + 1;
454  if (numerator > denominator * number_of_subsets_limit) {
455  return false;
456  }
457  }
458  return true;
459 }
460 
462 {
463  assert(n > 0);
464  const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
465  for (size_t p = 1; p <= n - 1; p++) {
466  if (!is_number_of_subsets_feasible(n, p+1)) {
467  return max(p, minresult);
468  }
469  }
470  return n - 1;
471 }
472 
474  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
475  float required_efficiency_to_beat,
476  vector<size_t>& inout_remainder,
477  vector<size_t>& out_subset)
478 {
479  out_subset.resize(0);
480 
481  if (required_efficiency_to_beat >= 1.0f) {
482  cerr << "can't beat efficiency 1." << endl;
483  abort();
484  }
485 
486  while (!inout_remainder.empty()) {
487 
488  vector<size_t> candidate_indices(inout_remainder.size());
489  for (size_t i = 0; i < candidate_indices.size(); i++) {
490  candidate_indices[i] = i;
491  }
492 
493  size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
494  while (candidate_indices_subset_size >= 1) {
495  vector<size_t> candidate_indices_subset;
496  make_first_subset(candidate_indices_subset_size,
497  candidate_indices_subset,
498  candidate_indices.size());
499 
500  vector<size_t> best_candidate_indices_subset;
501  float best_efficiency = 0.0f;
502  vector<size_t> trial_subset = out_subset;
503  trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
504  while (true)
505  {
506  for (size_t i = 0; i < candidate_indices_subset_size; i++) {
507  trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
508  }
509 
510  float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
511  if (trial_efficiency > best_efficiency) {
512  best_efficiency = trial_efficiency;
513  best_candidate_indices_subset = candidate_indices_subset;
514  }
515  if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
516  break;
517  }
518  next_subset(candidate_indices_subset, candidate_indices.size());
519  }
520 
521  if (best_efficiency > required_efficiency_to_beat) {
522  for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
523  candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
524  }
525  candidate_indices.resize(best_candidate_indices_subset.size());
526  }
527  candidate_indices_subset_size--;
528  }
529 
530  size_t candidate_index = candidate_indices[0];
531  auto candidate_iterator = inout_remainder.begin() + candidate_index;
532  vector<size_t> trial_subset = out_subset;
533 
534  trial_subset.push_back(*candidate_iterator);
535  float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
536  if (trial_efficiency > required_efficiency_to_beat) {
537  out_subset.push_back(*candidate_iterator);
538  inout_remainder.erase(candidate_iterator);
539  } else {
540  break;
541  }
542  }
543 }
544 
546  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
547  float required_efficiency_to_beat,
548  vector<vector<size_t>>& out_partition)
549 {
550  out_partition.resize(0);
551 
552  vector<size_t> remainder;
553  for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
554  remainder.push_back(i);
555  }
556 
557  while (!remainder.empty()) {
558  vector<size_t> new_subset;
560  preprocessed_inputfiles,
561  required_efficiency_to_beat,
562  remainder,
563  new_subset);
564  out_partition.push_back(new_subset);
565  }
566 }
567 
569  const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
570  const vector<vector<size_t>>& partition)
571 {
572  float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
573  cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency" << endl;
574  for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
575  cout << " Subset " << (subset - partition.begin())
576  << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
577  << endl;
578  for (auto file = subset->begin(); file != subset->end(); ++file) {
579  cout << " " << preprocessed_inputfiles[*file].filename << endl;
580  }
581  if (dump_tables) {
582  cout << " Table:" << endl;
583  dump_table_for_subset(preprocessed_inputfiles, *subset);
584  }
585  }
586  cout << endl;
587 }
588 
589 struct action_t
590 {
591  virtual const char* invokation_name() const { abort(); return nullptr; }
592  virtual void run(const vector<string>&) const { abort(); }
593  virtual ~action_t() {}
594 };
595 
597 {
598  virtual const char* invokation_name() const override { return "partition"; }
599  virtual void run(const vector<string>& input_filenames) const override
600  {
601  vector<preprocessed_inputfile_t> preprocessed_inputfiles;
602 
603  if (input_filenames.empty()) {
604  cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
605  exit(1);
606  }
607 
608  for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
609  inputfile_t inputfile(*it);
610  switch (inputfile.type) {
612  preprocessed_inputfiles.emplace_back(inputfile);
613  break;
615  cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
616  << "has no use for " << *it << " which contains measurements for default sizes." << endl;
617  exit(1);
618  break;
619  default:
620  cerr << "Unrecognized input file: " << *it << endl;
621  exit(1);
622  }
623  }
624 
625  check_all_files_in_same_exact_order(preprocessed_inputfiles);
626 
627  float required_efficiency_to_beat = 0.0f;
628  vector<vector<vector<size_t>>> partitions;
629  cerr << "searching for partitions...\r" << flush;
630  while (true)
631  {
632  vector<vector<size_t>> partition;
634  preprocessed_inputfiles,
635  required_efficiency_to_beat,
636  partition);
637  float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
638  cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
639  << " subsets for " << 100.0f * actual_efficiency
640  << " % efficiency"
641  << " \r" << flush;
642  partitions.push_back(partition);
643  if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
644  break;
645  }
646  required_efficiency_to_beat = actual_efficiency;
647  }
648  cerr << " " << endl;
649  while (true) {
650  bool repeat = false;
651  for (size_t i = 0; i < partitions.size() - 1; i++) {
652  if (partitions[i].size() >= partitions[i+1].size()) {
653  partitions.erase(partitions.begin() + i);
654  repeat = true;
655  break;
656  }
657  }
658  if (!repeat) {
659  break;
660  }
661  }
662  for (auto it = partitions.begin(); it != partitions.end(); ++it) {
663  print_partition(preprocessed_inputfiles, *it);
664  }
665  }
666 };
667 
669 {
677  };
678  friend ostream& operator<<(ostream& s, const results_entry_t& entry)
679  {
680  return s
681  << "Product size " << size_triple_t(entry.product_size)
682  << ": default block size " << entry.default_block_size
683  << " -> " << entry.default_gflops
684  << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
685  << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
686  << " -> " << entry.best_pot_gflops
687  << " GFlop/s" << dec;
688  }
689  static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
690  return e1.default_efficiency < e2.default_efficiency;
691  }
692  virtual const char* invokation_name() const override { return "evaluate-defaults"; }
693  void show_usage_and_exit() const
694  {
695  cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
696  cerr << "checks how well the performance with default sizes compares to the best "
697  << "performance measured over all POT sizes." << endl;
698  exit(1);
699  }
700  virtual void run(const vector<string>& input_filenames) const override
701  {
702  if (input_filenames.size() != 2) {
704  }
705  inputfile_t inputfile_default_sizes(input_filenames[0]);
706  inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
707  if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
708  cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
710  }
711  if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
712  cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
714  }
715  vector<results_entry_t> results;
716  vector<results_entry_t> cubic_results;
717 
718  uint16_t product_size = 0;
719  auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
720  for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
721  it_default_sizes != inputfile_default_sizes.entries.end();
722  ++it_default_sizes)
723  {
724  if (it_default_sizes->product_size == product_size) {
725  continue;
726  }
727  product_size = it_default_sizes->product_size;
728  while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
729  it_all_pot_sizes->product_size != product_size)
730  {
731  ++it_all_pot_sizes;
732  }
733  if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
734  break;
735  }
736  uint16_t best_pot_block_size = 0;
737  float best_pot_gflops = 0;
738  for (auto it = it_all_pot_sizes;
739  it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
740  ++it)
741  {
742  if (it->gflops > best_pot_gflops) {
743  best_pot_gflops = it->gflops;
744  best_pot_block_size = it->pot_block_size;
745  }
746  }
747  results_entry_t entry;
748  entry.product_size = product_size;
749  entry.default_block_size = it_default_sizes->nonpot_block_size;
750  entry.best_pot_block_size = best_pot_block_size;
751  entry.default_gflops = it_default_sizes->gflops;
752  entry.best_pot_gflops = best_pot_gflops;
753  entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
754  results.push_back(entry);
755 
756  size_triple_t t(product_size);
757  if (t.k == t.m && t.m == t.n) {
758  cubic_results.push_back(entry);
759  }
760  }
761 
762  cout << "All results:" << endl;
763  for (auto it = results.begin(); it != results.end(); ++it) {
764  cout << *it << endl;
765  }
766  cout << endl;
767 
768  sort(results.begin(), results.end(), lower_efficiency);
769 
770  const size_t n = min<size_t>(20, results.size());
771  cout << n << " worst results:" << endl;
772  for (size_t i = 0; i < n; i++) {
773  cout << results[i] << endl;
774  }
775  cout << endl;
776 
777  cout << "cubic results:" << endl;
778  for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
779  cout << *it << endl;
780  }
781  cout << endl;
782 
783  sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
784 
785  cout.precision(2);
786  vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
787  for (auto it = a.begin(); it != a.end(); ++it) {
788  size_t n = min(results.size() - 1, size_t(*it * results.size()));
789  cout << (100.0f * n / (results.size() - 1))
790  << " % of product sizes have default efficiency <= "
791  << 100.0f * results[n].default_efficiency << " %" << endl;
792  }
793  cout.precision(default_precision);
794  }
795 };
796 
797 
798 void show_usage_and_exit(int argc, char* argv[],
799  const vector<unique_ptr<action_t>>& available_actions)
800 {
801  cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
802  cerr << "available actions:" << endl;
803  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
804  cerr << " " << (*it)->invokation_name() << endl;
805  }
806  cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
807  exit(1);
808 }
809 
810 int main(int argc, char* argv[])
811 {
812  cout.precision(default_precision);
813  cerr.precision(default_precision);
814 
815  vector<unique_ptr<action_t>> available_actions;
816  available_actions.emplace_back(new partition_action_t);
817  available_actions.emplace_back(new evaluate_defaults_action_t);
818 
819  vector<string> input_filenames;
820 
821  action_t* action = nullptr;
822 
823  if (argc < 2) {
824  show_usage_and_exit(argc, argv, available_actions);
825  }
826  for (int i = 1; i < argc; i++) {
827  bool arg_handled = false;
828  // Step 1. Try to match action invocation names.
829  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
830  if (!strcmp(argv[i], (*it)->invokation_name())) {
831  if (!action) {
832  action = it->get();
833  arg_handled = true;
834  break;
835  } else {
836  cerr << "can't specify more than one action!" << endl;
837  show_usage_and_exit(argc, argv, available_actions);
838  }
839  }
840  }
841  if (arg_handled) {
842  continue;
843  }
844  // Step 2. Try to match option names.
845  if (argv[i][0] == '-') {
846  if (!strcmp(argv[i], "--only-cubic-sizes")) {
847  only_cubic_sizes = true;
848  arg_handled = true;
849  }
850  if (!strcmp(argv[i], "--dump-tables")) {
851  dump_tables = true;
852  arg_handled = true;
853  }
854  if (!arg_handled) {
855  cerr << "Unrecognized option: " << argv[i] << endl;
856  show_usage_and_exit(argc, argv, available_actions);
857  }
858  }
859  if (arg_handled) {
860  continue;
861  }
862  // Step 3. Default to interpreting args as input filenames.
863  input_filenames.emplace_back(argv[i]);
864  }
865 
866  if (dump_tables && only_cubic_sizes) {
867  cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
868  show_usage_and_exit(argc, argv, available_actions);
869  }
870 
871  if (!action) {
872  show_usage_and_exit(argc, argv, available_actions);
873  }
874 
875  action->run(input_filenames);
876 }
Matrix3f m
unsigned char uint8_t
Definition: ms_stdint.h:83
float efficiency
virtual void run(const vector< string > &input_filenames) const override
size_t max_feasible_subset_size(size_t n)
float gflops
int main(int argc, char *argv[])
#define max(a, b)
Definition: datatypes.h:20
vector< preprocessed_inputfile_entry_t > entries
void make_first_subset(size_t subset_size, vector< size_t > &out_subset, size_t set_size)
uint16_t pot_block_size
size_triple_t(uint16_t compact)
bool dump_tables
#define min(a, b)
Definition: datatypes.h:19
virtual void run(const vector< string > &) const
size_triple_t(const size_triple_t &o)
bool lower_efficiency(const preprocessed_inputfile_entry_t &e1, const preprocessed_inputfile_entry_t &e2)
int n
Definition: BFloat16.h:88
unsigned short uint16_t
Definition: ms_stdint.h:84
#define isfinite(X)
Definition: main.h:95
float efficiency_of_subset(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< size_t > &subset)
uint8_t log2_pot(size_t x)
void find_partition_with_efficiency_higher_than(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, float required_efficiency_to_beat, vector< vector< size_t >> &out_partition)
vector< inputfile_entry_t > entries
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
float efficiency_of_partition(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< vector< size_t >> &partition)
static const Line3 l(Rot3(), 1, 1)
size_triple_t nonpot_block_size
void next_subset(vector< size_t > &inout_subset, size_t set_size)
uint16_t block_size
void show_usage_and_exit(int argc, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
const size_t number_of_subsets_limit
static bool lower_efficiency(const results_entry_t &e1, const results_entry_t &e2)
bool is_last_subset(const vector< size_t > &subset, size_t set_size)
inputfile_t(const string &fname)
bool is_number_of_subsets_feasible(size_t n, size_t p)
unsigned __int64 uint64_t
Definition: ms_stdint.h:95
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
float default_gflops
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
size_triple_t(size_t _k, size_t _m, size_t _n)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
RealScalar s
void find_subset_with_efficiency_higher_than(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, float required_efficiency_to_beat, vector< size_t > &inout_remainder, vector< size_t > &out_subset)
virtual const char * invokation_name() const override
ostream & operator<<(ostream &s, const size_triple_t &t)
const int default_precision
void print_partition(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< vector< size_t >> &partition)
uint16_t product_size
const size_t always_search_subsets_of_size_at_least
uint16_t product_size
void dump_table_for_subset(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< size_t > &subset)
std::map< std::string, Array< float, 1, 8, DontAlign|RowMajor > > results
static EIGEN_DEPRECATED const end_t end
float * p
void import_input_file_range_one_product_size(const vector< inputfile_entry_t >::const_iterator &begin, const vector< inputfile_entry_t >::const_iterator &end)
uint16_t product_size
float best_pot_gflops
virtual void run(const vector< string > &input_filenames) const override
bool only_cubic_sizes
uint16_t best_pot_block_size
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
size_triple_t default_block_size
void check_all_files_in_same_exact_order(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles)
friend ostream & operator<<(ostream &s, const results_entry_t &entry)
std::ptrdiff_t j
float default_efficiency
preprocessed_inputfile_t(const inputfile_t &inputfile)
Point2 t(10, 10)
virtual const char * invokation_name() const override
virtual const char * invokation_name() const
constexpr array< t, n > repeat(t v)
Definition: CXX11Meta.h:483


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:33:53