54     k = 1 << ((compact & 0xf00) >> 8);
 
   55     m = 1 << ((compact & 0x0f0) >> 4);
 
   56     n = 1 << ((compact & 0x00f) >> 0);
 
   63   return s << 
"(" << 
t.k << 
", " << 
t.m << 
", " << 
t.n << 
")";
 
   92       cerr << 
"couldn't open input file: " << 
filename << endl;
 
   96     while (getline(
stream, line)) {
 
   97       if (line.empty()) 
continue;
 
   98       if (line.find(
"BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
 
   99         if (
type != type_t::unknown) {
 
  100           cerr << 
"Input file " << 
filename << 
" contains redundant BEGIN MEASUREMENTS lines";
 
  103         type = type_t::all_pot_sizes;
 
  106       if (line.find(
"BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
 
  107         if (
type != type_t::unknown) {
 
  108           cerr << 
"Input file " << 
filename << 
" contains redundant BEGIN MEASUREMENTS lines";
 
  111         type = type_t::default_sizes;
 
  116       if (
type == type_t::unknown) {
 
  120         case type_t::all_pot_sizes: {
 
  121           unsigned int product_size, block_size;
 
  124             sscanf(line.c_str(), 
"%x %x %f",
 
  128           if (3 != sscanf_result ||
 
  130               product_size > 0xfff ||
 
  132               block_size > 0xfff ||
 
  135             cerr << 
"ill-formed input file: " << 
filename << endl;
 
  136             cerr << 
"offending line:" << endl << line << endl;
 
  146           entries.push_back(entry);
 
  149         case type_t::default_sizes: {
 
  150           unsigned int product_size;
 
  154             sscanf(line.c_str(), 
"%x default(%d, %d, %d) %f",
 
  158           if (5 != sscanf_result ||
 
  160               product_size > 0xfff ||
 
  163             cerr << 
"ill-formed input file: " << 
filename << endl;
 
  164             cerr << 
"offending line:" << endl << line << endl;
 
  175           entries.push_back(entry);
 
  184     if (
type == type_t::unknown) {
 
  185       cerr << 
"Unrecognized input file " << 
filename << endl;
 
  188     if (entries.empty()) {
 
  189       cerr << 
"didn't find any measurements in input file: " << 
filename << endl;
 
  211   vector<preprocessed_inputfile_entry_t> 
entries;
 
  219     auto it = inputfile.
entries.begin();
 
  220     auto it_first_with_given_product_size = it;
 
  221     while (it != inputfile.
entries.end()) {
 
  223       if (it == inputfile.
entries.end() ||
 
  224         it->product_size != it_first_with_given_product_size->product_size)
 
  226         import_input_file_range_one_product_size(it_first_with_given_product_size, it);
 
  227         it_first_with_given_product_size = it;
 
  234     const vector<inputfile_entry_t>::const_iterator& begin,
 
  235     const vector<inputfile_entry_t>::const_iterator& 
end)
 
  237     uint16_t product_size = begin->product_size;
 
  238     float max_gflops = 0.0f;
 
  239     for (
auto it = begin; it != 
end; ++it) {
 
  240       if (it->product_size != product_size) {
 
  241         cerr << 
"Unexpected ordering of entries in " << 
filename << endl;
 
  242         cerr << 
"(Expected all entries for product size " << hex << product_size << dec << 
" to be grouped)" << endl;
 
  245       max_gflops = 
max(max_gflops, it->gflops);
 
  247     for (
auto it = begin; it != 
end; ++it) {
 
  252       entries.push_back(entry);
 
  258        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
 
  260   if (preprocessed_inputfiles.empty()) {
 
  265   const size_t num_entries = first_file.
entries.size();
 
  267   for (
size_t i = 0; 
i < preprocessed_inputfiles.size(); 
i++) {
 
  268     if (preprocessed_inputfiles[
i].entries.size() != num_entries) {
 
  269       cerr << 
"these files have different number of entries: " 
  270            << preprocessed_inputfiles[
i].filename
 
  278   for (
size_t entry_index = 0; entry_index < num_entries; entry_index++) {
 
  279     const uint16_t entry_product_size = first_file.
entries[entry_index].product_size;
 
  280     const uint16_t entry_block_size = first_file.
entries[entry_index].block_size;
 
  281     for (
size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
 
  283       if (cur_file.
entries[entry_index].product_size != entry_product_size ||
 
  284           cur_file.
entries[entry_index].block_size != entry_block_size)
 
  286         cerr << 
"entries not in same order between these files: " 
  298         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  299         const vector<size_t>& subset)
 
  301   if (subset.size() <= 1) {
 
  305   const size_t num_entries = first_file.
entries.size();
 
  306   float efficiency = 1.0f;
 
  307   size_t entry_index = 0;
 
  308   size_t first_entry_index_with_this_product_size = 0;
 
  310   while (entry_index < num_entries) {
 
  312     if (entry_index == num_entries ||
 
  313         first_file.
entries[entry_index].product_size != product_size)
 
  315       float efficiency_this_product_size = 0.0f;
 
  316       for (
size_t e = first_entry_index_with_this_product_size; 
e < entry_index; 
e++) {
 
  317         float efficiency_this_entry = 1.0f;
 
  318         for (
auto i = subset.begin(); 
i != subset.end(); ++
i) {
 
  319           efficiency_this_entry = 
min(efficiency_this_entry, preprocessed_inputfiles[*
i].entries[
e].efficiency);
 
  321         efficiency_this_product_size = 
max(efficiency_this_product_size, efficiency_this_entry);
 
  323       efficiency = 
min(efficiency, efficiency_this_product_size);
 
  324       if (entry_index < num_entries) {
 
  325         first_entry_index_with_this_product_size = entry_index;
 
  326         product_size = first_file.
entries[entry_index].product_size;
 
  335         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  336         const vector<size_t>& subset)
 
  339   const size_t num_entries = first_file.
entries.size();
 
  340   size_t entry_index = 0;
 
  341   size_t first_entry_index_with_this_product_size = 0;
 
  350     cerr << 
"Can't generate tables with --only-cubic-sizes." << endl;
 
  353   cout << 
"struct LookupTable {" << endl;
 
  354   cout << 
"  static const size_t BaseSize = " << min_product_size.
k << 
";" << endl;
 
  355   const size_t NumSizes = 
log2_pot(max_product_size.
k / min_product_size.
k) + 1;
 
  356   const size_t TableSize = NumSizes * NumSizes * NumSizes;
 
  357   cout << 
"  static const size_t NumSizes = " << NumSizes << 
";" << endl;
 
  358   cout << 
"  static const unsigned short* Data() {" << endl;
 
  359   cout << 
"    static const unsigned short data[" << TableSize << 
"] = {";
 
  360   while (entry_index < num_entries) {
 
  362     if (entry_index == num_entries ||
 
  363         first_file.
entries[entry_index].product_size != product_size)
 
  365       float best_efficiency_this_product_size = 0.0f;
 
  366       uint16_t best_block_size_this_product_size = 0;
 
  367       for (
size_t e = first_entry_index_with_this_product_size; 
e < entry_index; 
e++) {
 
  368         float efficiency_this_entry = 1.0f;
 
  369         for (
auto i = subset.begin(); 
i != subset.end(); ++
i) {
 
  370           efficiency_this_entry = 
min(efficiency_this_entry, preprocessed_inputfiles[*
i].entries[
e].efficiency);
 
  372         if (efficiency_this_entry > best_efficiency_this_product_size) {
 
  373           best_efficiency_this_product_size = efficiency_this_entry;
 
  374           best_block_size_this_product_size = first_file.
entries[
e].block_size;
 
  377       if ((
i++) % NumSizes) {
 
  382       cout << 
"0x" << hex << best_block_size_this_product_size << dec;
 
  383       if (entry_index < num_entries) {
 
  385         first_entry_index_with_this_product_size = entry_index;
 
  386         product_size = first_file.
entries[entry_index].product_size;
 
  390   if (
i != TableSize) {
 
  391     cerr << endl << 
"Wrote " << 
i << 
" table entries, expected " << TableSize << endl;
 
  394   cout << endl << 
"    };" << endl;
 
  395   cout << 
"    return data;" << endl;
 
  396   cout << 
"  }" << endl;
 
  397   cout << 
"};" << endl;
 
  401         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  402         const vector<vector<size_t>>& partition)
 
  404   float efficiency = 1.0f;
 
  405   for (
auto s = partition.begin(); 
s != partition.end(); ++
s) {
 
  413   assert(subset_size >= 1 && subset_size <= set_size);
 
  414   out_subset.resize(subset_size);
 
  415   for (
size_t i = 0; 
i < subset_size; 
i++) {
 
  422   return subset[0] == set_size - subset.size();
 
  428     cerr << 
"iterating past the last subset" << endl;
 
  432   while (inout_subset[inout_subset.size() - 
i] == set_size - 
i) {
 
  434     assert(
i <= inout_subset.size());
 
  436   size_t first_index_to_change = inout_subset.size() - 
i;
 
  437   inout_subset[first_index_to_change]++;
 
  438   size_t p = inout_subset[first_index_to_change];
 
  439   for (
size_t j = first_index_to_change + 1; 
j < inout_subset.size(); 
j++) {
 
  440     inout_subset[
j] = ++
p;
 
  449   assert(
n>0 && 
p>0 && 
p<=
n);
 
  450   uint64_t numerator = 1, denominator = 1;
 
  451   for (
size_t i = 0; 
i < 
p; 
i++) {
 
  453     denominator *= 
i + 1;
 
  465   for (
size_t p = 1; 
p <= 
n - 1; 
p++) {
 
  467       return max(
p, minresult);
 
  474        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  475        float required_efficiency_to_beat,
 
  476        vector<size_t>& inout_remainder,
 
  477        vector<size_t>& out_subset)
 
  479   out_subset.resize(0);
 
  481   if (required_efficiency_to_beat >= 1.0
f) {
 
  482     cerr << 
"can't beat efficiency 1." << endl;
 
  486   while (!inout_remainder.empty()) {
 
  488     vector<size_t> candidate_indices(inout_remainder.size());
 
  489     for (
size_t i = 0; 
i < candidate_indices.size(); 
i++) {
 
  490       candidate_indices[
i] = 
i;
 
  494     while (candidate_indices_subset_size >= 1) {
 
  495       vector<size_t> candidate_indices_subset;
 
  497                         candidate_indices_subset,
 
  498                         candidate_indices.size());
 
  500       vector<size_t> best_candidate_indices_subset;
 
  501       float best_efficiency = 0.0f;
 
  502       vector<size_t> trial_subset = out_subset;
 
  503       trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
 
  506         for (
size_t i = 0; 
i < candidate_indices_subset_size; 
i++) {
 
  507           trial_subset[out_subset.size() + 
i] = inout_remainder[candidate_indices_subset[
i]];
 
  511         if (trial_efficiency > best_efficiency) {
 
  512           best_efficiency = trial_efficiency;
 
  513           best_candidate_indices_subset = candidate_indices_subset;
 
  515         if (
is_last_subset(candidate_indices_subset, candidate_indices.size())) {
 
  518         next_subset(candidate_indices_subset, candidate_indices.size());
 
  521       if (best_efficiency > required_efficiency_to_beat) {
 
  522         for (
size_t i = 0; 
i < best_candidate_indices_subset.size(); 
i++) {
 
  523           candidate_indices[
i] = candidate_indices[best_candidate_indices_subset[
i]];
 
  525         candidate_indices.resize(best_candidate_indices_subset.size());
 
  527       candidate_indices_subset_size--;
 
  530     size_t candidate_index = candidate_indices[0];
 
  531     auto candidate_iterator = inout_remainder.begin() + candidate_index;
 
  532     vector<size_t> trial_subset = out_subset;
 
  534     trial_subset.push_back(*candidate_iterator);
 
  536     if (trial_efficiency > required_efficiency_to_beat) {
 
  537       out_subset.push_back(*candidate_iterator);
 
  538       inout_remainder.erase(candidate_iterator);
 
  546        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  547        float required_efficiency_to_beat,
 
  548        vector<vector<size_t>>& out_partition)
 
  550   out_partition.resize(0);
 
  552   vector<size_t> remainder;
 
  553   for (
size_t i = 0; 
i < preprocessed_inputfiles.size(); 
i++) {
 
  554     remainder.push_back(
i);
 
  557   while (!remainder.empty()) {
 
  558     vector<size_t> new_subset;
 
  560       preprocessed_inputfiles,
 
  561       required_efficiency_to_beat,
 
  564     out_partition.push_back(new_subset);
 
  569        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
 
  570        const vector<vector<size_t>>& partition)
 
  573   cout << 
"Partition into " << partition.size() << 
" subsets for " << efficiency * 100.0f << 
"% efficiency"  << endl;
 
  574   for (
auto subset = partition.begin(); subset != partition.end(); ++subset) {
 
  575     cout << 
"  Subset " << (subset - partition.begin())
 
  578     for (
auto file = subset->begin(); 
file != subset->end(); ++
file) {
 
  579       cout << 
"    " << preprocessed_inputfiles[*
file].filename << endl;
 
  582       cout << 
"  Table:" << endl;
 
  592   virtual void run(
const vector<string>&)
 const { abort(); }
 
  599   virtual void run(
const vector<string>& input_filenames)
 const override 
  601     vector<preprocessed_inputfile_t> preprocessed_inputfiles;
 
  603     if (input_filenames.empty()) {
 
  604       cerr << 
"The " << invokation_name() << 
" action needs a list of input files." << endl;
 
  608     for (
auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
 
  610       switch (inputfile.
type) {
 
  612           preprocessed_inputfiles.emplace_back(inputfile);
 
  615           cerr << 
"The " << invokation_name() << 
" action only uses measurements for all pot sizes, and " 
  616                << 
"has no use for " << *it << 
" which contains measurements for default sizes." << endl;
 
  620           cerr << 
"Unrecognized input file: " << *it << endl;
 
  627     float required_efficiency_to_beat = 0.0f;
 
  628     vector<vector<vector<size_t>>> partitions;
 
  629     cerr << 
"searching for partitions...\r" << flush;
 
  632       vector<vector<size_t>> partition;
 
  634         preprocessed_inputfiles,
 
  635         required_efficiency_to_beat,
 
  638       cerr << 
"partition " << preprocessed_inputfiles.size() << 
" files into " << partition.size()
 
  639            << 
" subsets for " << 100.0f * actual_efficiency
 
  642       partitions.push_back(partition);
 
  643       if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
 
  646       required_efficiency_to_beat = actual_efficiency;
 
  651       for (
size_t i = 0; 
i < partitions.size() - 1; 
i++) {
 
  652         if (partitions[
i].
size() >= partitions[
i+1].
size()) {
 
  653           partitions.erase(partitions.begin() + 
i);
 
  662     for (
auto it = partitions.begin(); it != partitions.end(); ++it) {
 
  687       << 
" GFlop/s" << dec;
 
  695     cerr << 
"usage: " << invokation_name() << 
" default-sizes-data all-pot-sizes-data" << endl;
 
  696     cerr << 
"checks how well the performance with default sizes compares to the best " 
  697          << 
"performance measured over all POT sizes." << endl;
 
  700   virtual void run(
const vector<string>& input_filenames)
 const override 
  702     if (input_filenames.size() != 2) {
 
  705     inputfile_t inputfile_default_sizes(input_filenames[0]);
 
  706     inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
 
  708       cerr << inputfile_default_sizes.
filename << 
" is not an input file with default sizes." << endl;
 
  712       cerr << inputfile_all_pot_sizes.
filename << 
" is not an input file with all POT sizes." << endl;
 
  715     vector<results_entry_t> 
results;
 
  716     vector<results_entry_t> cubic_results;
 
  719     auto it_all_pot_sizes = inputfile_all_pot_sizes.
entries.begin();
 
  720     for (
auto it_default_sizes = inputfile_default_sizes.
entries.begin();
 
  721          it_default_sizes != inputfile_default_sizes.
entries.end();
 
  724       if (it_default_sizes->product_size == product_size) {
 
  727       product_size = it_default_sizes->product_size;
 
  728       while (it_all_pot_sizes != inputfile_all_pot_sizes.
entries.end() &&
 
  729              it_all_pot_sizes->product_size != product_size)
 
  733       if (it_all_pot_sizes == inputfile_all_pot_sizes.
entries.end()) {
 
  737       float best_pot_gflops = 0;
 
  738       for (
auto it = it_all_pot_sizes;
 
  739            it != inputfile_all_pot_sizes.
entries.end() && it->product_size == product_size;
 
  742         if (it->gflops > best_pot_gflops) {
 
  743           best_pot_gflops = it->gflops;
 
  744           best_pot_block_size = it->pot_block_size;
 
  757       if (
t.k == 
t.m && 
t.m == 
t.n) {
 
  758         cubic_results.push_back(entry);
 
  762     cout << 
"All results:" << endl;
 
  770     const size_t n = min<size_t>(20, 
results.size());
 
  771     cout << 
n << 
" worst results:" << endl;
 
  772     for (
size_t i = 0; 
i < 
n; 
i++) {
 
  777     cout << 
"cubic results:" << endl;
 
  778     for (
auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
 
  786     vector<float> 
a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
 
  787     for (
auto it = 
a.begin(); it != 
a.end(); ++it) {
 
  789       cout << (100.0f * 
n / (
results.size() - 1))
 
  790            << 
" % of product sizes have default efficiency <= " 
  791            << 100.0f * 
results[
n].default_efficiency << 
" %" << endl;
 
  799                          const vector<unique_ptr<action_t>>& available_actions)
 
  801   cerr << 
"usage: " << argv[0] << 
" <action> [options...] <input files...>" << endl;
 
  802   cerr << 
"available actions:" << endl;
 
  803   for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
 
  804     cerr << 
"  " << (*it)->invokation_name() << endl;
 
  806   cerr << 
"the input files should each contain an output of benchmark-blocking-sizes" << endl;
 
  810 int main(
int argc, 
char* argv[])
 
  815   vector<unique_ptr<action_t>> available_actions;
 
  819   vector<string> input_filenames;
 
  826   for (
int i = 1; 
i < argc; 
i++) {
 
  827     bool arg_handled = 
false;
 
  829     for (
auto it = available_actions.begin(); it != available_actions.end(); ++it) {
 
  830       if (!strcmp(argv[
i], (*it)->invokation_name())) {
 
  836           cerr << 
"can't specify more than one action!" << endl;
 
  845     if (argv[
i][0] == 
'-') {
 
  846       if (!strcmp(argv[
i], 
"--only-cubic-sizes")) {
 
  850       if (!strcmp(argv[
i], 
"--dump-tables")) {
 
  855         cerr << 
"Unrecognized option: " << argv[
i] << endl;
 
  863     input_filenames.emplace_back(argv[
i]);
 
  867     cerr << 
"Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
 
  875   action->run(input_filenames);