TensorRandom.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13 
14 namespace Eigen {
15 namespace internal {
16 
17 namespace {
18 
19 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
20 #if defined(EIGEN_GPU_COMPILE_PHASE)
21  // We don't support 3d kernels since we currently only use 1 and
22  // 2d kernels.
23  gpu_assert(threadIdx.z == 0);
24  return blockIdx.x * blockDim.x + threadIdx.x
25  + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26 #else
27  // Rely on Eigen's random implementation.
28  return random<uint64_t>();
29 #endif
30 }
31 
32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33  // TODO: Unify with the implementation in the non blocking thread pool.
34  uint64_t current = *state;
35  // Update the internal state
36  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37  // Generate the random output (using the PCG-XSH-RS scheme)
38  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39 }
40 
41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
42  seed = seed ? seed : get_random_seed();
43  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44 }
45 
46 } // namespace
47 
48 
49 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
51  unsigned rnd = PCG_XSH_RS_generator(state, stream);
52  return static_cast<T>(rnd);
53 }
54 
55 
57 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
58  // Generate 10 random bits for the mantissa, merge with exponent.
59  unsigned rnd = PCG_XSH_RS_generator(state, stream);
60  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
62  // Return the final result
63  return result - Eigen::half(1.0f);
64 }
65 
67 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
68 
69  // Generate 7 random bits for the mantissa, merge with exponent.
70  unsigned rnd = PCG_XSH_RS_generator(state, stream);
71  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
73  // Return the final result
74  return result - Eigen::bfloat16(1.0f);
75 }
76 
79  typedef union {
80  uint32_t raw;
81  float fp;
82  } internal;
83  internal result;
84  // Generate 23 random bits for the mantissa mantissa
85  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
86  result.raw = rnd & 0x7fffffu;
87  // Set the exponent
88  result.raw |= (static_cast<uint32_t>(127) << 23);
89  // Return the final result
90  return result.fp - 1.0f;
91 }
92 
95  typedef union {
96  uint64_t raw;
97  double dp;
98  } internal;
99  internal result;
100  result.raw = 0;
101  // Generate 52 random bits for the mantissa
102  // First generate the upper 20 bits
103  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
104  // The generate the lower 32 bits
105  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
106  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
107  // Set the exponent
108  result.raw |= (static_cast<uint64_t>(1023) << 52);
109  // Return the final result
110  return result.dp - 1.0;
111 }
112 
114 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
115  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
117 }
119 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
120  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
122 }
123 
124 template <typename T> class UniformRandomGenerator {
125  public:
126  static const bool PacketAccess = true;
127 
128  // Uses the given "seed" if non-zero, otherwise uses a random seed.
130  uint64_t seed = 0) {
131  m_state = PCG_XSH_RS_state(seed);
132  #ifdef EIGEN_USE_SYCL
133  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
134  // Therefor, we need two step to initializate the m_state.
135  // IN SYCL, the constructor of the functor is s called on the CPU
136  // and we get the clock seed here from the CPU. However, This seed is
137  //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
138  // and only available on the Operator() function (which is called on the GPU).
139  // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
140  // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
141  // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
142  // similar to CUDA Therefore, the thread Id injection is not available at this stage.
143  //However when the operator() is called the thread ID will be avilable. So inside the opeator,
144  // we add the thrreadID, BlockId,... (which is equivalent of i)
145  //to the seed and construct the unique m_state per thead similar to cuda.
146  m_exec_once =false;
147  #endif
148  }
150  const UniformRandomGenerator& other) {
151  m_state = other.m_state;
152  #ifdef EIGEN_USE_SYCL
153  m_exec_once =other.m_exec_once;
154  #endif
155  }
156 
157  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
158  T operator()(Index i) const {
159  #ifdef EIGEN_USE_SYCL
160  if(!m_exec_once) {
161  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
162  // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
163  m_state += (i * 6364136223846793005ULL);
164  m_exec_once =true;
165  }
166  #endif
167  T result = RandomToTypeUniform<T>(&m_state, i);
168  return result;
169  }
170 
171  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
173  const int packetSize = internal::unpacket_traits<Packet>::size;
174  EIGEN_ALIGN_MAX T values[packetSize];
175  #ifdef EIGEN_USE_SYCL
176  if(!m_exec_once) {
177  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
178  m_state += (i * 6364136223846793005ULL);
179  m_exec_once =true;
180  }
181  #endif
183  for (int j = 0; j < packetSize; ++j) {
184  values[j] = RandomToTypeUniform<T>(&m_state, i);
185  }
186  return internal::pload<Packet>(values);
187  }
188 
189  private:
190  mutable uint64_t m_state;
191  #ifdef EIGEN_USE_SYCL
192  mutable bool m_exec_once;
193  #endif
194 };
195 
196 template <typename Scalar>
198  enum {
199  // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
201  ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
203  };
204 };
205 
206 
207 
208 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
210  // Use the ratio of uniform method to generate numbers following a normal
211  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
212  // details.
213  T u, v, q;
214  do {
215  u = RandomToTypeUniform<T>(state, stream);
216  v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
217  const T x = u - T(0.449871);
218  const T y = numext::abs(v) + T(0.386595);
219  q = x*x + y * (T(0.196)*y - T(0.25472)*x);
220  } while (q > T(0.27597) &&
221  (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
222 
223  return v/u;
224 }
225 
227 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
228  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
229  RandomToTypeNormal<float>(state, stream));
230 }
232 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
233  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
234  RandomToTypeNormal<double>(state, stream));
235 }
236 
237 
238 template <typename T> class NormalRandomGenerator {
239  public:
240  static const bool PacketAccess = true;
241 
242  // Uses the given "seed" if non-zero, otherwise uses a random seed.
244  m_state = PCG_XSH_RS_state(seed);
245  #ifdef EIGEN_USE_SYCL
246  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
247  // Therefor, we need two steps to initializate the m_state.
248  // IN SYCL, the constructor of the functor is s called on the CPU
249  // and we get the clock seed here from the CPU. However, This seed is
250  //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
251  // and only available on the Operator() function (which is called on the GPU).
252  // Therefore, the thread Id injection is not available at this stage. However when the operator()
253  //is called the thread ID will be avilable. So inside the opeator,
254  // we add the thrreadID, BlockId,... (which is equivalent of i)
255  //to the seed and construct the unique m_state per thead similar to cuda.
256  m_exec_once =false;
257  #endif
258  }
260  const NormalRandomGenerator& other) {
261  m_state = other.m_state;
262 #ifdef EIGEN_USE_SYCL
263  m_exec_once=other.m_exec_once;
264 #endif
265  }
266 
267  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
268  T operator()(Index i) const {
269  #ifdef EIGEN_USE_SYCL
270  if(!m_exec_once) {
271  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
272  m_state += (i * 6364136223846793005ULL);
273  m_exec_once =true;
274  }
275  #endif
276  T result = RandomToTypeNormal<T>(&m_state, i);
277  return result;
278  }
279 
280  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
282  const int packetSize = internal::unpacket_traits<Packet>::size;
283  EIGEN_ALIGN_MAX T values[packetSize];
284  #ifdef EIGEN_USE_SYCL
285  if(!m_exec_once) {
286  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
287  m_state += (i * 6364136223846793005ULL);
288  m_exec_once =true;
289  }
290  #endif
292  for (int j = 0; j < packetSize; ++j) {
293  values[j] = RandomToTypeNormal<T>(&m_state, i);
294  }
295  return internal::pload<Packet>(values);
296  }
297 
298  private:
299  mutable uint64_t m_state;
300  #ifdef EIGEN_USE_SYCL
301  mutable bool m_exec_once;
302  #endif
303 };
304 
305 
306 template <typename Scalar>
308  enum {
309  // On average, we need to generate about 3 random numbers
310  // 15 mul, 8 add, 1.5 logs
313  3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
315  };
316 };
317 
318 
319 } // end namespace internal
320 } // end namespace Eigen
321 
322 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
SCALAR Scalar
Definition: bench_gemm.cpp:46
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator &other)
Definition: TensorRandom.h:259
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition: TensorRandom.h:281
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:209
leaf::MyValues values
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
dim3 threadIdx
Definition: gpu_common.h:19
unsigned short uint16_t
Definition: ms_stdint.h:84
Holds information about the various numeric (i.e. scalar) types allowed by Eigen. ...
Definition: NumTraits.h:232
#define EIGEN_ALIGN_MAX
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition: TensorRandom.h:158
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE internal::enable_if< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typename NumTraits< T >::Real >::type abs(const T &x)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform< double >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:94
Values result
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed=0)
Definition: TensorRandom.h:129
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed=0)
Definition: TensorRandom.h:243
Array< int, Dynamic, 1 > v
unsigned int uint32_t
Definition: ms_stdint.h:85
unsigned __int64 uint64_t
Definition: ms_stdint.h:95
Eigen::Triplet< double > T
Point2(* f)(const Point3 &, OptionalJacobian< 2, 3 >)
EIGEN_DEVICE_FUNC const Scalar & q
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator &other)
Definition: TensorRandom.h:149
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform< float >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:78
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:976
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src &src)
Definition: NumTraits.h:85
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:50
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition: TensorRandom.h:268
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition: TensorRandom.h:172
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
dim3 blockIdx
Definition: gpu_common.h:19
std::ptrdiff_t j
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1461
dim3 blockDim
Definition: gpu_common.h:19


gtsam
Author(s):
autogenerated on Tue Jul 4 2023 02:37:13