TensorCostModel.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12 
13 namespace Eigen {
14 
23 // Class storing the cost of evaluating a tensor expression in terms of the
24 // estimated number of operand bytes loaded, bytes stored, and compute cycles.
   // NOTE(review): the embedded line numbers in this listing skip values (for
   // example 30 -> 34 and 53 -> 61), so several signatures and bodies are not
   // shown here. The comments added below describe only the visible lines.
25 class TensorOpCost {
26  public:
27  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
28  // model based on minimal reciprocal throughput numbers from Intel or
29  // Agner Fog's tables would be better than what is there now.
    // Five templated helpers follow; only their template headers and closing
    // braces are visible in this listing.
30  template <typename ArgType>
34  }
35  template <typename ArgType>
38  }
39  template <typename ArgType>
43  }
44  template <typename ArgType>
47  }
48  template <typename SrcType, typename TargetType>
52  }
53 
61 
    // Tail of the constructor that takes `vectorized` and `packet_size`: when
    // `vectorized` is true the stored compute cycles are `compute_cycles`
    // divided by `packet_size`; otherwise `compute_cycles` is stored as given.
64  bool vectorized, double packet_size)
67  compute_cycles_(vectorized ? compute_cycles / packet_size
68  : compute_cycles) {
72  }
73 
    // Accessor bodies: each returns one of the three stored cost components.
75  return bytes_loaded_;
76  }
78  return bytes_stored_;
79  }
81  return compute_cycles_;
82  }
    // Weighted sum of the three components. The weights are the caller-supplied
    // costs per byte loaded, per byte stored, and per compute cycle.
84  double load_cost, double store_cost, double compute_cost) const {
85  return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
86  compute_cost * compute_cycles_;
87  }
88 
89  // Drop memory access component. Intended for cases when memory accesses are
90  // sequential or are completely masked by computations.
92  bytes_loaded_ = 0;
93  bytes_stored_ = 0;
94  }
95 
96  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
98  const TensorOpCost& rhs) const {
103  }
104 
105  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
107  const TensorOpCost& rhs) const {
112  }
113 
    // Accumulates `rhs` into this cost and returns *this. The visible lines add
    // the loaded and stored byte counts; listing line 118 is not shown.
115  const TensorOpCost& rhs) {
116  bytes_loaded_ += rhs.bytes_loaded();
117  bytes_stored_ += rhs.bytes_stored();
119  return *this;
120  }
121 
    // Scales all three components by the scalar `rhs` and returns *this.
123  bytes_loaded_ *= rhs;
124  bytes_stored_ *= rhs;
125  compute_cycles_ *= rhs;
126  return *this;
127  }
128 
    // Binary forms: the TensorOpCost operand is taken by value, so the compound
    // assignment mutates that local copy, which is then returned.
130  TensorOpCost lhs, const TensorOpCost& rhs) {
131  lhs += rhs;
132  return lhs;
133  }
135  TensorOpCost lhs, double rhs) {
136  lhs *= rhs;
137  return lhs;
138  }
140  double lhs, TensorOpCost rhs) {
141  rhs *= lhs;
142  return rhs;
143  }
144 
    // Writes the three components to `os` as
    // "[bytes_loaded = ..., bytes_stored = ..., compute_cycles = ...]" and
    // returns the stream so calls can be chained.
145  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
146  return os << "[bytes_loaded = " << tc.bytes_loaded()
147  << ", bytes_stored = " << tc.bytes_stored()
148  << ", compute_cycles = " << tc.compute_cycles() << "]";
149  }
150 
151  private:
    // The data member declarations (listing lines 152-154) are not shown here.
155 };
156 
157 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
158 // in [1:max_threads] instead of just switching multi-threading off for small
159 // work units.
   // NOTE(review): listing line 161 (the class declaration line) and the
   // signature lines 174, 187 and 192 are not shown in this listing.
160 template <typename Device>
162  public:
163  // Scaling from Eigen compute cost to device cycles.
164  static const int kDeviceCyclesPerComputeCycle = 1;
165 
166  // Costs in device cycles.
167  static const int kStartupCycles = 100000;
168  static const int kPerThreadCycles = 100000;
169  static const int kTaskSize = 40000;
170 
171  // Returns the number of threads in [1:max_threads] to use for
172  // evaluating an expression with the given output size and cost per
173  // coefficient.
175  double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
176  double cost = totalCost(output_size, cost_per_coeff);
    // The startup cost is subtracted before dividing by the per-thread cost.
    // The 0.9 offset, combined with the truncating cast below, rounds the
    // estimate up once its fractional part reaches 0.1.
177  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178  // Make sure we don't invoke undefined behavior when we convert to an int.
    // NOTE(review): only the upper bound is clamped before the cast; this
    // presumably relies on the cost being finite and non-negative. Confirm
    // against callers.
179  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
    // Raise the truncated value to at least 1, then cap it at max_threads.
180  return numext::mini(max_threads,
181  numext::maxi<int>(1, static_cast<int>(threads)));
182  }
183 
184  // taskSize assesses parallel task size.
185  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
186  // granularity needs to be increased to mitigate parallelization overheads.
188  double output_size, const TensorOpCost& cost_per_coeff) {
189  return totalCost(output_size, cost_per_coeff) / kTaskSize;
190  }
191 
    // Total estimated cost: output_size times the weighted per-coefficient
    // cost.
193  double output_size, const TensorOpCost& cost_per_coeff) {
194  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
195  // 11 is L2 cache latency on Haswell.
196  // We don't know whether data is in L1, L2 or L3. But we are most interested
197  // in single-threaded computational time around 100us-10ms (smaller time
198  // is too small for parallelization, larger time is not interesting
199  // either because we are probably using all available threads already).
200  // And for the target time range, L2 seems to be what matters. Data set
201  // fitting into L1 is too small to take noticeable time. Data set fitting
202  // only into L3 presumably will take more than 10ms to load and process.
    // Loads and stores are charged the same rate: 11 cycles per 64 bytes.
203  const double kLoadCycles = 1.0 / 64 * 11;
204  const double kStoreCycles = 1.0 / 64 * 11;
205  // Scaling from Eigen compute cost to device cycles.
206  return output_size *
    // The third total_cost argument is on listing line 208, which is not shown.
207  cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
209  }
210 };
211 
212 } // namespace Eigen
213 
214 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
Eigen::TensorOpCost::operator*=
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator*=(double rhs)
Definition: TensorCostModel.h:122
Eigen::TensorOpCost::compute_cycles
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const
Definition: TensorCostModel.h:80
EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::TensorOpCost::TensorOpCost
EIGEN_DEVICE_FUNC TensorOpCost()
Definition: TensorCostModel.h:55
Eigen::internal::scalar_quotient_op
Definition: BinaryFunctors.h:378
Eigen::TensorOpCost::TensorOpCost
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
Definition: TensorCostModel.h:63
Eigen::numext::isfinite
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
Definition: BFloat16.h:671
Eigen::TensorOpCost::operator+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
Definition: TensorCostModel.h:129
eigen_assert
#define eigen_assert(x)
Definition: Macros.h:1037
Eigen::TensorOpCost::operator*
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
Definition: TensorCostModel.h:139
Eigen::internal::scalar_cast_op
Definition: UnaryFunctors.h:160
Eigen::TensorOpCost::compute_cycles_
double compute_cycles_
Definition: TensorCostModel.h:154
Eigen::TensorOpCost::ModCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
Definition: TensorCostModel.h:45
os
ofstream os("timeSchurFactors.csv")
Eigen::TensorOpCost::dropMemoryCost
EIGEN_DEVICE_FUNC void dropMemoryCost()
Definition: TensorCostModel.h:91
Eigen::TensorOpCost::bytes_loaded
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const
Definition: TensorCostModel.h:74
Eigen::internal::scalar_product_op
Definition: BinaryFunctors.h:70
Eigen::GenericNumTraits
Definition: NumTraits.h:152
Eigen::TensorOpCost::cwiseMin
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:97
Eigen::TensorOpCost::bytes_stored
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const
Definition: TensorCostModel.h:77
Eigen::TensorOpCost::AddCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
Definition: TensorCostModel.h:36
Eigen::numext::mini
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: Eigen/src/Core/MathFunctions.h:1085
Eigen::TensorOpCost::total_cost
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
Definition: TensorCostModel.h:83
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
Eigen::TensorOpCost::DivCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
Definition: TensorCostModel.h:40
Eigen::TensorCostModel::kPerThreadCycles
static const int kPerThreadCycles
Definition: TensorCostModel.h:168
Eigen::TensorOpCost::operator<<
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
Definition: TensorCostModel.h:145
Eigen::TensorCostModel::kDeviceCyclesPerComputeCycle
static const int kDeviceCyclesPerComputeCycle
Definition: TensorCostModel.h:164
Eigen::TensorCostModel::totalCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:192
Eigen::TensorCostModel::kStartupCycles
static const int kStartupCycles
Definition: TensorCostModel.h:167
Eigen::TensorOpCost::bytes_stored_
double bytes_stored_
Definition: TensorCostModel.h:153
Eigen::TensorCostModel::kTaskSize
static const int kTaskSize
Definition: TensorCostModel.h:169
Eigen::TensorCostModel::taskSize
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:187
Eigen::TensorCostModel::numThreads
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
Definition: TensorCostModel.h:174
Eigen::TensorOpCost::MulCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()
Definition: TensorCostModel.h:31
Eigen::internal::functor_traits
Definition: XprHelper.h:175
Eigen::TensorOpCost::operator+=
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator+=(const TensorOpCost &rhs)
Definition: TensorCostModel.h:114
Eigen::numext::maxi
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: Eigen/src/Core/MathFunctions.h:1093
Eigen::TensorOpCost::TensorOpCost
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
Definition: TensorCostModel.h:57
Eigen::TensorCostModel
Definition: TensorCostModel.h:161
Eigen::TensorOpCost::operator*
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
Definition: TensorCostModel.h:134
Eigen::TensorOpCost::cwiseMax
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:106
Eigen::TensorOpCost
Definition: TensorCostModel.h:25
Eigen::TensorOpCost::bytes_loaded_
double bytes_loaded_
Definition: TensorCostModel.h:152
Eigen::TensorOpCost::CastCost
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
Definition: TensorCostModel.h:49


gtsam
Author(s):
autogenerated on Tue Jan 7 2025 04:05:36