15 #include "absl/random/internal/nanobenchmark.h"
17 #include <sys/types.h>
30 #include "absl/base/attributes.h"
31 #include "absl/base/internal/raw_logging.h"
32 #include "absl/random/internal/platform.h"
33 #include "absl/random/internal/randen_engine.h"
36 #if defined(_WIN32) || defined(_WIN64)
40 #elif defined(__ANDROID__)
41 #define ABSL_OS_ANDROID
43 #elif defined(__linux__)
46 #include <sys/syscall.h>
49 #if defined(ABSL_ARCH_X86_64) && !defined(ABSL_OS_WIN)
54 #if defined(ABSL_ARCH_PPC)
55 #include <sys/platform/ppc.h>
59 #if defined(ABSL_ARCH_ARM) || defined(ABSL_ARCH_AARCH64)
64 #if ABSL_HAVE_ATTRIBUTE(noinline) || (defined(__GNUC__) && !defined(__clang__))
65 #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __attribute__((noinline))
66 #elif defined(_MSC_VER)
67 #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __declspec(noinline)
69 #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE
74 namespace random_internal_nanobenchmark {
79 #if defined(ABSL_ARCH_X86_64)
84 #if defined(ABSL_OS_WIN)
87 for (
int i = 0;
i < 4; ++
i) {
101 char brand_string[49];
105 Cpuid(0x80000000U, 0, abcd);
106 if (abcd[0] < 0x80000004U) {
110 for (
int i = 0;
i < 3; ++
i) {
111 Cpuid(0x80000002U +
i, 0, abcd);
112 memcpy(brand_string +
i * 16, &abcd,
sizeof(abcd));
114 brand_string[48] = 0;
120 double NominalClockRate() {
124 const char* prefixes[3] = {
"MHz",
"GHz",
"THz"};
125 const double multipliers[3] = {1E6, 1E9, 1E12};
126 for (
size_t i = 0;
i < 3; ++
i) {
127 const size_t pos_prefix = brand_string.find(prefixes[
i]);
128 if (pos_prefix != std::string::npos) {
129 const size_t pos_space = brand_string.rfind(
' ', pos_prefix - 1);
130 if (pos_space != std::string::npos) {
132 brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
133 return std::stod(digits) * multipliers[
i];
141 #endif // ABSL_ARCH_X86_64
146 inline void PreventElision(
T&&
output) {
151 asm volatile(
"" :
"+r"(
output) : :
"memory");
157 static std::atomic<T>
dummy(
T{});
217 #if defined(ABSL_ARCH_PPC)
218 asm volatile(
"mfspr %0, %1" :
"=r"(t) :
"i"(268));
219 #elif defined(ABSL_ARCH_X86_64)
220 #if defined(ABSL_OS_WIN)
239 :
"rdx",
"memory",
"cc");
244 clock_gettime(CLOCK_REALTIME, &ts);
245 t = ts.tv_sec * 1000000000
LL + ts.tv_nsec;
252 #if defined(ABSL_ARCH_X86_64)
253 #if defined(ABSL_OS_WIN)
271 :
"rcx",
"rdx",
"memory",
"cc");
284 #if defined(ABSL_ARCH_X86_64)
285 #if defined(ABSL_OS_WIN)
289 t =
static_cast<uint32_t>(__rdtsc());
304 t =
static_cast<uint32_t>(Start64());
311 #if defined(ABSL_ARCH_X86_64)
312 #if defined(ABSL_OS_WIN)
315 t =
static_cast<uint32_t>(__rdtscp(&aux));
327 :
"rcx",
"rdx",
"memory");
330 t =
static_cast<uint32_t>(Stop64());
337 namespace robust_statistics {
342 void CountingSort(
T*
values,
size_t num_values) {
344 using Unique = std::pair<T, int>;
345 std::vector<Unique>
unique;
346 for (
size_t i = 0;
i < num_values; ++
i) {
350 [
value](
const Unique
u) { return u.first == value; });
363 for (
const auto& value_count :
unique) {
364 std::fill(p, p + value_count.second, value_count.first);
365 p += value_count.second;
372 template <
typename T>
374 const size_t idx_begin,
const size_t half_count) {
378 for (
size_t idx = idx_begin;
idx < idx_begin + half_count; ++
idx) {
381 if (
range < min_range) {
396 template <
typename T>
398 const size_t num_values) {
399 size_t idx_begin = 0;
400 size_t half_count = num_values / 2;
401 while (half_count > 1) {
402 idx_begin = MinRange(sorted, idx_begin, half_count);
406 const T x = sorted[idx_begin + 0];
407 if (half_count == 0) {
411 const T average = (
x + sorted[idx_begin + 1] + 1) / 2;
416 template <
typename T>
417 T Mode(
T*
values,
const size_t num_values) {
418 CountingSort(
values, num_values);
419 return ModeOfSorted(
values, num_values);
422 template <
typename T,
size_t N>
428 template <
typename T>
429 T Median(
T*
values,
const size_t num_values) {
432 const size_t half = num_values / 2;
434 if (num_values % 2) {
442 template <
typename T>
443 T MedianAbsoluteDeviation(
const T*
values,
const size_t num_values,
446 std::vector<T> abs_deviations;
447 abs_deviations.reserve(num_values);
448 for (
size_t i = 0;
i < num_values; ++
i) {
450 abs_deviations.push_back(
static_cast<T>(abs));
452 return Median(abs_deviations.data(), num_values);
463 Ticks TimerResolution() {
469 const Ticks
t0 = timer::Start32();
470 const Ticks
t1 = timer::Stop32();
471 samples[
i] =
t1 -
t0;
473 repetitions[
rep] = robust_statistics::Mode(samples);
475 return robust_statistics::Mode(repetitions);
478 static const Ticks timer_resolution = TimerResolution();
482 template <
class Lambda>
483 Ticks SampleUntilStable(
const double max_rel_mad,
double* rel_mad,
484 const Params& p,
const Lambda& lambda) {
485 auto measure_duration = [&lambda]() -> Ticks {
486 const Ticks
t0 = timer::Start32();
488 const Ticks
t1 = timer::Stop32();
493 Ticks est = measure_duration();
495 const size_t ticks_per_eval = ticks_per_second *
p.seconds_per_eval;
496 size_t samples_per_eval = ticks_per_eval / est;
497 samples_per_eval = (
std::max)(samples_per_eval,
p.min_samples_per_eval);
499 std::vector<Ticks> samples;
500 samples.reserve(1 + samples_per_eval);
501 samples.push_back(est);
505 const Ticks max_abs_mad = (timer_resolution + 99) / 100;
508 for (
size_t eval = 0; eval <
p.max_evals; ++eval, samples_per_eval *= 2) {
509 samples.reserve(samples.size() + samples_per_eval);
510 for (
size_t i = 0;
i < samples_per_eval; ++
i) {
511 const Ticks
r = measure_duration();
512 samples.push_back(
r);
515 if (samples.size() >=
p.min_mode_samples) {
516 est = robust_statistics::Mode(samples.data(), samples.size());
519 est = robust_statistics::Median(samples.data(), samples.size());
524 const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
525 samples.data(), samples.size(), est);
526 *rel_mad =
static_cast<double>(
static_cast<int>(abs_mad)) / est;
528 if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
531 "%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
532 samples.size(), est, abs_mad, *rel_mad * 100.0);
540 "rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
541 *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
546 using InputVec = std::vector<FuncInput>;
549 InputVec UniqueInputs(
const FuncInput* inputs,
const size_t num_inputs) {
550 InputVec
unique(inputs, inputs + num_inputs);
561 Ticks min_duration = ~0
u;
569 if (elapsed >= (1
ULL << 30)) {
571 "Measurement failed: need 64-bit timer for input=%zu\n",
572 static_cast<size_t>(
input));
577 const Ticks
total = SampleUntilStable(
578 p.target_rel_mad, &rel_mad, p,
579 [
func,
arg,
input]() { PreventElision(func(arg, input)); });
580 min_duration = (
std::min)(min_duration,
total - timer_resolution);
584 const size_t max_skip =
p.precision_divisor;
586 const size_t num_skip =
587 min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
590 timer_resolution, max_skip, min_duration, num_skip);
596 InputVec ReplicateInputs(
const FuncInput* inputs,
const size_t num_inputs,
597 const size_t num_unique,
const size_t num_skip,
600 if (num_unique == 1) {
601 full.assign(
p.subset_ratio * num_skip, inputs[0]);
605 full.reserve(
p.subset_ratio * num_skip * num_inputs);
606 for (
size_t i = 0;
i <
p.subset_ratio * num_skip; ++
i) {
607 full.insert(full.end(), inputs, inputs + num_inputs);
610 std::shuffle(full.begin(), full.end(), rng);
616 void FillSubset(
const InputVec& full,
const FuncInput input_to_skip,
617 const size_t num_skip, InputVec* subset) {
618 const size_t count =
std::count(full.begin(), full.end(), input_to_skip);
620 std::vector<uint32_t> omit;
623 for (
size_t i = 0;
i <
count; ++
i) {
629 std::shuffle(omit.begin(), omit.end(), rng);
630 omit.resize(num_skip);
631 std::sort(omit.begin(), omit.end());
635 size_t idx_subset = 0;
637 if (
next == input_to_skip) {
640 if (idx_omit < num_skip) {
642 if (occurrence == omit[idx_omit]) {
648 if (idx_subset < subset->
size()) {
649 (*subset)[idx_subset++] =
next;
652 ABSL_RAW_CHECK(idx_subset == subset->size(),
"idx_subset not at end");
658 Ticks TotalDuration(
const Func func,
const void*
arg,
const InputVec* inputs,
659 const Params& p,
double* max_rel_mad) {
661 const Ticks duration =
662 SampleUntilStable(
p.target_rel_mad, &rel_mad, p, [
func,
arg, inputs]() {
663 for (const FuncInput input : *inputs) {
664 PreventElision(func(arg, input));
667 *max_rel_mad = (
std::max)(*max_rel_mad, rel_mad);
679 Ticks Overhead(
const void*
arg,
const InputVec* inputs,
const Params& p) {
682 return SampleUntilStable(0.0, &rel_mad, p, [
arg, inputs]() {
684 PreventElision(EmptyFunc(
arg,
input));
694 #if defined(ABSL_OS_WIN)
696 cpu =
static_cast<int>(GetCurrentProcessorNumber());
705 }
else if (cpu >= 64) {
709 const DWORD_PTR prev = SetThreadAffinityMask(GetCurrentThread(), 1
ULL << cpu);
711 #elif defined(ABSL_OS_LINUX) && !defined(ABSL_OS_ANDROID)
713 cpu = sched_getcpu();
720 const int err = sched_setaffinity(pid,
sizeof(
set), &
set);
728 #if defined(ABSL_ARCH_PPC)
729 return __ppc_get_timebase_freq();
730 #elif defined(ABSL_ARCH_X86_64)
732 return platform::NominalClockRate();
740 const InputVec&
unique,
const InputVec& full,
742 const float mul = 1.0f /
static_cast<int>(num_skip);
744 InputVec subset(full.size() - num_skip);
746 const Ticks overhead_skip = Overhead(
arg, &subset, p);
755 subset.size(),
overhead, overhead_skip);
758 double max_rel_mad = 0.0;
759 const Ticks
total = TotalDuration(
func,
arg, &full, p, &max_rel_mad);
761 for (
size_t i = 0;
i <
unique.size(); ++
i) {
762 FillSubset(full,
unique[
i], num_skip, &subset);
763 const Ticks total_skip = TotalDuration(
func,
arg, &subset, p, &max_rel_mad);
765 if (
total < total_skip) {
771 const Ticks duration = (
total -
overhead) - (total_skip - overhead_skip);
774 results[
i].variability = max_rel_mad;
784 const InputVec
unique = UniqueInputs(inputs, num_inputs);
786 if (num_skip == 0)
return 0;
788 const InputVec full =
789 ReplicateInputs(inputs, num_inputs,
unique.size(), num_skip, p);
792 for (
size_t i = 0;
i < p.max_measure_retries;
i++) {