10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 30 template <
typename ArgType>
35 template <
typename ArgType>
39 template <
typename ArgType>
44 template <
typename ArgType>
48 template <
typename SrcType,
typename TargetType>
64 bool vectorized,
double packet_size)
84 double load_cost,
double store_cost,
double compute_cost)
const {
102 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
111 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
160 template <
typename Device>
164 static const int kDeviceCyclesPerComputeCycle = 1;
167 static const int kStartupCycles = 100000;
168 static const int kPerThreadCycles = 100000;
169 static const int kTaskSize = 40000;
175 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
176 double cost = totalCost(output_size, cost_per_coeff);
177 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178 return numext::mini(max_threads, numext::maxi(1, threads));
185 double output_size,
const TensorOpCost& cost_per_coeff) {
186 return totalCost(output_size, cost_per_coeff) / kTaskSize;
191 double output_size,
const TensorOpCost& cost_per_coeff) {
201 const double kLoadCycles = 1.0 / 64 * 11;
202 const double kStoreCycles = 1.0 / 64 * 11;
205 cost_per_coeff.
total_cost(kLoadCycles, kStoreCycles,
206 kDeviceCyclesPerComputeCycle);
212 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool() isfinite(const half &a)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
#define EIGEN_STRONG_INLINE
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator*=(double rhs)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const
EIGEN_DEVICE_FUNC TensorOpCost()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost &rhs) const
EIGEN_DEVICE_FUNC void dropMemoryCost()
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator+=(const TensorOpCost &rhs)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost &rhs) const
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()