54 k = 1 << ((compact & 0xf00) >> 8);
55 m = 1 << ((compact & 0x0f0) >> 4);
56 n = 1 << ((compact & 0x00f) >> 0);
63 return s <<
"(" << t.
k <<
", " << t.
m <<
", " << t.
n <<
")";
91 if (!stream.is_open()) {
92 cerr <<
"couldn't open input file: " << filename << endl;
96 while (getline(stream, line)) {
97 if (line.empty())
continue;
98 if (line.find(
"BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
99 if (type != type_t::unknown) {
100 cerr <<
"Input file " << filename <<
" contains redundant BEGIN MEASUREMENTS lines";
103 type = type_t::all_pot_sizes;
106 if (line.find(
"BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
107 if (type != type_t::unknown) {
108 cerr <<
"Input file " << filename <<
" contains redundant BEGIN MEASUREMENTS lines";
111 type = type_t::default_sizes;
116 if (type == type_t::unknown) {
120 case type_t::all_pot_sizes: {
121 unsigned int product_size, block_size;
124 sscanf(line.c_str(),
"%x %x %f",
128 if (3 != sscanf_result ||
130 product_size > 0xfff ||
132 block_size > 0xfff ||
135 cerr <<
"ill-formed input file: " << filename << endl;
136 cerr <<
"offending line:" << endl << line << endl;
146 entries.push_back(entry);
149 case type_t::default_sizes: {
150 unsigned int product_size;
154 sscanf(line.c_str(),
"%x default(%d, %d, %d) %f",
158 if (5 != sscanf_result ||
160 product_size > 0xfff ||
163 cerr <<
"ill-formed input file: " << filename << endl;
164 cerr <<
"offending line:" << endl << line << endl;
175 entries.push_back(entry);
184 if (type == type_t::unknown) {
185 cerr <<
"Unrecognized input file " << filename << endl;
188 if (entries.empty()) {
189 cerr <<
"didn't find any measurements in input file: " << filename << endl;
211 vector<preprocessed_inputfile_entry_t>
entries;
214 : filename(inputfile.filename)
219 auto it = inputfile.
entries.begin();
220 auto it_first_with_given_product_size = it;
221 while (it != inputfile.
entries.end()) {
223 if (it == inputfile.
entries.end() ||
224 it->product_size != it_first_with_given_product_size->product_size)
226 import_input_file_range_one_product_size(it_first_with_given_product_size, it);
227 it_first_with_given_product_size = it;
234 const vector<inputfile_entry_t>::const_iterator& begin,
235 const vector<inputfile_entry_t>::const_iterator&
end)
237 uint16_t product_size = begin->product_size;
238 float max_gflops = 0.0f;
239 for (
auto it = begin; it !=
end; ++it) {
240 if (it->product_size != product_size) {
241 cerr <<
"Unexpected ordering of entries in " << filename << endl;
242 cerr <<
"(Expected all entries for product size " << hex << product_size << dec <<
" to be grouped)" << endl;
245 max_gflops =
max(max_gflops, it->gflops);
247 for (
auto it = begin; it !=
end; ++it) {
252 entries.push_back(entry);
258 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
260 if (preprocessed_inputfiles.empty()) {
265 const size_t num_entries = first_file.
entries.size();
267 for (
size_t i = 0;
i < preprocessed_inputfiles.size();
i++) {
268 if (preprocessed_inputfiles[
i].entries.size() != num_entries) {
269 cerr <<
"these files have different number of entries: " 270 << preprocessed_inputfiles[
i].filename
278 for (
size_t entry_index = 0; entry_index < num_entries; entry_index++) {
279 const uint16_t entry_product_size = first_file.
entries[entry_index].product_size;
280 const uint16_t entry_block_size = first_file.
entries[entry_index].block_size;
281 for (
size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
283 if (cur_file.
entries[entry_index].product_size != entry_product_size ||
284 cur_file.
entries[entry_index].block_size != entry_block_size)
286 cerr <<
"entries not in same order between these files: " 298 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
299 const vector<size_t>& subset)
301 if (subset.size() <= 1) {
305 const size_t num_entries = first_file.
entries.size();
306 float efficiency = 1.0f;
307 size_t entry_index = 0;
308 size_t first_entry_index_with_this_product_size = 0;
310 while (entry_index < num_entries) {
312 if (entry_index == num_entries ||
313 first_file.
entries[entry_index].product_size != product_size)
315 float efficiency_this_product_size = 0.0f;
316 for (
size_t e = first_entry_index_with_this_product_size;
e < entry_index;
e++) {
317 float efficiency_this_entry = 1.0f;
318 for (
auto i = subset.begin();
i != subset.end(); ++
i) {
319 efficiency_this_entry =
min(efficiency_this_entry, preprocessed_inputfiles[*
i].entries[
e].efficiency);
321 efficiency_this_product_size =
max(efficiency_this_product_size, efficiency_this_entry);
323 efficiency =
min(efficiency, efficiency_this_product_size);
324 if (entry_index < num_entries) {
325 first_entry_index_with_this_product_size = entry_index;
326 product_size = first_file.
entries[entry_index].product_size;
335 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
336 const vector<size_t>& subset)
339 const size_t num_entries = first_file.
entries.size();
340 size_t entry_index = 0;
341 size_t first_entry_index_with_this_product_size = 0;
346 if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
350 cerr <<
"Can't generate tables with --only-cubic-sizes." << endl;
353 cout <<
"struct LookupTable {" << endl;
354 cout <<
" static const size_t BaseSize = " << min_product_size.k <<
";" << endl;
355 const size_t NumSizes =
log2_pot(max_product_size.k / min_product_size.k) + 1;
356 const size_t TableSize = NumSizes * NumSizes * NumSizes;
357 cout <<
" static const size_t NumSizes = " << NumSizes <<
";" << endl;
358 cout <<
" static const unsigned short* Data() {" << endl;
359 cout <<
" static const unsigned short data[" << TableSize <<
"] = {";
360 while (entry_index < num_entries) {
362 if (entry_index == num_entries ||
363 first_file.
entries[entry_index].product_size != product_size)
365 float best_efficiency_this_product_size = 0.0f;
366 uint16_t best_block_size_this_product_size = 0;
367 for (
size_t e = first_entry_index_with_this_product_size;
e < entry_index;
e++) {
368 float efficiency_this_entry = 1.0f;
369 for (
auto i = subset.begin(); i != subset.end(); ++
i) {
370 efficiency_this_entry =
min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[
e].efficiency);
372 if (efficiency_this_entry > best_efficiency_this_product_size) {
373 best_efficiency_this_product_size = efficiency_this_entry;
374 best_block_size_this_product_size = first_file.
entries[
e].block_size;
377 if ((i++) % NumSizes) {
382 cout <<
"0x" << hex << best_block_size_this_product_size << dec;
383 if (entry_index < num_entries) {
385 first_entry_index_with_this_product_size = entry_index;
386 product_size = first_file.
entries[entry_index].product_size;
390 if (i != TableSize) {
391 cerr << endl <<
"Wrote " << i <<
" table entries, expected " << TableSize << endl;
394 cout << endl <<
" };" << endl;
395 cout <<
" return data;" << endl;
396 cout <<
" }" << endl;
397 cout <<
"};" << endl;
401 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
402 const vector<vector<size_t>>& partition)
404 float efficiency = 1.0f;
405 for (
auto s = partition.begin();
s != partition.end(); ++
s) {
413 assert(subset_size >= 1 && subset_size <= set_size);
414 out_subset.resize(subset_size);
415 for (
size_t i = 0;
i < subset_size;
i++) {
422 return subset[0] == set_size - subset.size();
428 cerr <<
"iterating past the last subset" << endl;
432 while (inout_subset[inout_subset.size() -
i] == set_size -
i) {
434 assert(i <= inout_subset.size());
436 size_t first_index_to_change = inout_subset.size() -
i;
437 inout_subset[first_index_to_change]++;
438 size_t p = inout_subset[first_index_to_change];
439 for (
size_t j = first_index_to_change + 1;
j < inout_subset.size();
j++) {
440 inout_subset[
j] = ++
p;
449 assert(n>0 && p>0 && p<=n);
450 uint64_t numerator = 1, denominator = 1;
451 for (
size_t i = 0;
i <
p;
i++) {
453 denominator *= i + 1;
465 for (
size_t p = 1;
p <= n - 1;
p++) {
467 return max(
p, minresult);
474 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
475 float required_efficiency_to_beat,
476 vector<size_t>& inout_remainder,
477 vector<size_t>& out_subset)
479 out_subset.resize(0);
481 if (required_efficiency_to_beat >= 1.0
f) {
482 cerr <<
"can't beat efficiency 1." << endl;
486 while (!inout_remainder.empty()) {
488 vector<size_t> candidate_indices(inout_remainder.size());
489 for (
size_t i = 0;
i < candidate_indices.size();
i++) {
490 candidate_indices[
i] =
i;
494 while (candidate_indices_subset_size >= 1) {
495 vector<size_t> candidate_indices_subset;
497 candidate_indices_subset,
498 candidate_indices.size());
500 vector<size_t> best_candidate_indices_subset;
501 float best_efficiency = 0.0f;
502 vector<size_t> trial_subset = out_subset;
503 trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
506 for (
size_t i = 0;
i < candidate_indices_subset_size;
i++) {
507 trial_subset[out_subset.size() +
i] = inout_remainder[candidate_indices_subset[
i]];
511 if (trial_efficiency > best_efficiency) {
512 best_efficiency = trial_efficiency;
513 best_candidate_indices_subset = candidate_indices_subset;
515 if (
is_last_subset(candidate_indices_subset, candidate_indices.size())) {
518 next_subset(candidate_indices_subset, candidate_indices.size());
521 if (best_efficiency > required_efficiency_to_beat) {
522 for (
size_t i = 0;
i < best_candidate_indices_subset.size();
i++) {
523 candidate_indices[
i] = candidate_indices[best_candidate_indices_subset[
i]];
525 candidate_indices.resize(best_candidate_indices_subset.size());
527 candidate_indices_subset_size--;
530 size_t candidate_index = candidate_indices[0];
531 auto candidate_iterator = inout_remainder.begin() + candidate_index;
532 vector<size_t> trial_subset = out_subset;
534 trial_subset.push_back(*candidate_iterator);
536 if (trial_efficiency > required_efficiency_to_beat) {
537 out_subset.push_back(*candidate_iterator);
538 inout_remainder.erase(candidate_iterator);
546 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
547 float required_efficiency_to_beat,
548 vector<vector<size_t>>& out_partition)
550 out_partition.resize(0);
552 vector<size_t> remainder;
553 for (
size_t i = 0;
i < preprocessed_inputfiles.size();
i++) {
554 remainder.push_back(
i);
557 while (!remainder.empty()) {
558 vector<size_t> new_subset;
560 preprocessed_inputfiles,
561 required_efficiency_to_beat,
564 out_partition.push_back(new_subset);
569 const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
570 const vector<vector<size_t>>& partition)
573 cout <<
"Partition into " << partition.size() <<
" subsets for " << efficiency * 100.0f <<
"% efficiency" << endl;
574 for (
auto subset = partition.begin(); subset != partition.end(); ++subset) {
575 cout <<
" Subset " << (subset - partition.begin())
578 for (
auto file = subset->begin();
file != subset->end(); ++
file) {
579 cout <<
" " << preprocessed_inputfiles[*
file].filename << endl;
582 cout <<
" Table:" << endl;
592 virtual void run(
const vector<string>&)
const { abort(); }
599 virtual void run(
const vector<string>& input_filenames)
const override 601 vector<preprocessed_inputfile_t> preprocessed_inputfiles;
603 if (input_filenames.empty()) {
604 cerr <<
"The " << invokation_name() <<
" action needs a list of input files." << endl;
608 for (
auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
610 switch (inputfile.
type) {
612 preprocessed_inputfiles.emplace_back(inputfile);
615 cerr <<
"The " << invokation_name() <<
" action only uses measurements for all pot sizes, and " 616 <<
"has no use for " << *it <<
" which contains measurements for default sizes." << endl;
620 cerr <<
"Unrecognized input file: " << *it << endl;
627 float required_efficiency_to_beat = 0.0f;
628 vector<vector<vector<size_t>>> partitions;
629 cerr <<
"searching for partitions...\r" << flush;
632 vector<vector<size_t>> partition;
634 preprocessed_inputfiles,
635 required_efficiency_to_beat,
638 cerr <<
"partition " << preprocessed_inputfiles.size() <<
" files into " << partition.size()
639 <<
" subsets for " << 100.0f * actual_efficiency
642 partitions.push_back(partition);
643 if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
646 required_efficiency_to_beat = actual_efficiency;
651 for (
size_t i = 0;
i < partitions.size() - 1;
i++) {
652 if (partitions[
i].
size() >= partitions[
i+1].size()) {
653 partitions.erase(partitions.begin() +
i);
662 for (
auto it = partitions.begin(); it != partitions.end(); ++it) {
687 <<
" GFlop/s" << dec;
695 cerr <<
"usage: " << invokation_name() <<
" default-sizes-data all-pot-sizes-data" << endl;
696 cerr <<
"checks how well the performance with default sizes compares to the best " 697 <<
"performance measured over all POT sizes." << endl;
700 virtual void run(
const vector<string>& input_filenames)
const override 702 if (input_filenames.size() != 2) {
705 inputfile_t inputfile_default_sizes(input_filenames[0]);
706 inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
708 cerr << inputfile_default_sizes.
filename <<
" is not an input file with default sizes." << endl;
712 cerr << inputfile_all_pot_sizes.
filename <<
" is not an input file with all POT sizes." << endl;
715 vector<results_entry_t>
results;
716 vector<results_entry_t> cubic_results;
719 auto it_all_pot_sizes = inputfile_all_pot_sizes.
entries.begin();
720 for (
auto it_default_sizes = inputfile_default_sizes.
entries.begin();
721 it_default_sizes != inputfile_default_sizes.
entries.end();
724 if (it_default_sizes->product_size == product_size) {
727 product_size = it_default_sizes->product_size;
728 while (it_all_pot_sizes != inputfile_all_pot_sizes.
entries.end() &&
729 it_all_pot_sizes->product_size != product_size)
733 if (it_all_pot_sizes == inputfile_all_pot_sizes.
entries.end()) {
737 float best_pot_gflops = 0;
738 for (
auto it = it_all_pot_sizes;
739 it != inputfile_all_pot_sizes.
entries.end() && it->product_size == product_size;
742 if (it->gflops > best_pot_gflops) {
743 best_pot_gflops = it->gflops;
744 best_pot_block_size = it->pot_block_size;
754 results.push_back(entry);
757 if (t.
k == t.
m && t.
m == t.
n) {
758 cubic_results.push_back(entry);
762 cout <<
"All results:" << endl;
763 for (
auto it = results.begin(); it != results.end(); ++it) {
770 const size_t n = min<size_t>(20, results.size());
771 cout << n <<
" worst results:" << endl;
772 for (
size_t i = 0;
i <
n;
i++) {
773 cout << results[
i] << endl;
777 cout <<
"cubic results:" << endl;
778 for (
auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
786 vector<float>
a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
787 for (
auto it = a.begin(); it != a.end(); ++it) {
788 size_t n =
min(results.size() - 1,
size_t(*it * results.size()));
789 cout << (100.0f * n / (results.size() - 1))
790 <<
" % of product sizes have default efficiency <= " 791 << 100.0f * results[
n].default_efficiency <<
" %" << endl;
799 const vector<unique_ptr<action_t>>& available_actions)
801 cerr <<
"usage: " << argv[0] <<
" <action> [options...] <input files...>" << endl;
802 cerr <<
"available actions:" << endl;
803 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
804 cerr <<
" " << (*it)->invokation_name() << endl;
806 cerr <<
"the input files should each contain an output of benchmark-blocking-sizes" << endl;
810 int main(
int argc,
char* argv[])
815 vector<unique_ptr<action_t>> available_actions;
819 vector<string> input_filenames;
826 for (
int i = 1;
i < argc;
i++) {
827 bool arg_handled =
false;
829 for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
830 if (!strcmp(argv[
i], (*it)->invokation_name())) {
836 cerr <<
"can't specify more than one action!" << endl;
845 if (argv[
i][0] ==
'-') {
846 if (!strcmp(argv[
i],
"--only-cubic-sizes")) {
850 if (!strcmp(argv[i],
"--dump-tables")) {
855 cerr <<
"Unrecognized option: " << argv[
i] << endl;
863 input_filenames.emplace_back(argv[
i]);
867 cerr <<
"Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
875 action->
run(input_filenames);
virtual void run(const vector< string > &input_filenames) const override
size_t max_feasible_subset_size(size_t n)
int main(int argc, char *argv[])
void make_first_subset(size_t subset_size, vector< size_t > &out_subset, size_t set_size)
size_triple_t(uint16_t compact)
virtual void run(const vector< string > &) const
size_triple_t(const size_triple_t &o)
bool lower_efficiency(const preprocessed_inputfile_entry_t &e1, const preprocessed_inputfile_entry_t &e2)
float efficiency_of_subset(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< size_t > &subset)
uint8_t log2_pot(size_t x)
void find_partition_with_efficiency_higher_than(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, float required_efficiency_to_beat, vector< vector< size_t >> &out_partition)
float efficiency_of_partition(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< vector< size_t >> &partition)
static const Line3 l(Rot3(), 1, 1)
size_triple_t nonpot_block_size
void next_subset(vector< size_t > &inout_subset, size_t set_size)
void show_usage_and_exit(int argc, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
const size_t number_of_subsets_limit
static bool lower_efficiency(const results_entry_t &e1, const results_entry_t &e2)
bool is_last_subset(const vector< size_t > &subset, size_t set_size)
bool is_number_of_subsets_feasible(size_t n, size_t p)
unsigned __int64 uint64_t
void show_usage_and_exit() const
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
size_triple_t(size_t _k, size_t _m, size_t _n)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
void find_subset_with_efficiency_higher_than(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, float required_efficiency_to_beat, vector< size_t > &inout_remainder, vector< size_t > &out_subset)
virtual const char * invokation_name() const override
ostream & operator<<(ostream &s, const size_triple_t &t)
const int default_precision
void print_partition(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< vector< size_t >> &partition)
const size_t always_search_subsets_of_size_at_least
void dump_table_for_subset(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles, const vector< size_t > &subset)
std::map< std::string, Array< float, 1, 8, DontAlign|RowMajor > > results
static EIGEN_DEPRECATED const end_t end
virtual void run(const vector< string > &input_filenames) const override
uint16_t best_pot_block_size
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
size_triple_t default_block_size
void check_all_files_in_same_exact_order(const vector< preprocessed_inputfile_t > &preprocessed_inputfiles)
friend ostream & operator<<(ostream &s, const results_entry_t &entry)
virtual const char * invokation_name() const override
virtual const char * invokation_name() const
constexpr array< t, n > repeat(t v)