TensorReductionGpu.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12 
13 namespace Eigen {
14 namespace internal {
15 
16 
17 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
18 // Full reducers for GPU, don't vectorize for now
19 
20 // Reducer function that enables multiple GPU threads to safely accumulate at the same
21 // output address. It basically reads the current value of the output variable, and
22 // attempts to update it with the new value. If in the meantime another GPU thread
23 // updated the content of the output address, it will try again.
24 template <typename T, typename R>
25 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
27  if (sizeof(T) == 4)
28  {
29  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30  unsigned int newval = oldval;
31  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32  if (newval == oldval) {
33  return;
34  }
35  unsigned int readback;
36  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37  oldval = readback;
38  newval = oldval;
39  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40  if (newval == oldval) {
41  return;
42  }
43  }
44  }
45  else if (sizeof(T) == 8) {
46  unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47  unsigned long long newval = oldval;
48  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49  if (newval == oldval) {
50  return;
51  }
52  unsigned long long readback;
53  while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54  oldval = readback;
55  newval = oldval;
56  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57  if (newval == oldval) {
58  return;
59  }
60  }
61  }
62  else {
63  gpu_assert(0 && "Wordsize not supported");
64  }
65 #else // EIGEN_CUDA_ARCH >= 300
66  gpu_assert(0 && "Shouldn't be called on unsupported device");
67 #endif // EIGEN_CUDA_ARCH >= 300
68 }
69 
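As an illustrative sketch (not part of the file), this is how a hypothetical kernel could use atomicReduce with Eigen's SumReducer: accumulate locally first, then publish one contended update per thread. It assumes *out was pre-initialized to reducer.initialize() and that the device supports the CAS path guarded above.

// Sketch only: toy_atomic_sum is not an Eigen kernel.
template <typename T>
__global__ void toy_atomic_sum(const T* in, int n, T* out) {
  Eigen::internal::SumReducer<T> reducer;
  T accum = reducer.initialize();                      // identity element (0 for sums)
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    reducer.reduce(in[i], &accum);                     // cheap thread-local accumulation
  }
  Eigen::internal::atomicReduce(out, accum, reducer);  // one atomic update per thread
}

The kernels below refine this pattern by first folding the per-thread accumulators across each warp with shuffle instructions, so only one lane per warp touches the atomic.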
70 // We extend atomicExch to support extra data types
71 template <typename Type>
72 __device__ inline Type atomicExchCustom(Type* address, Type val) {
73  return atomicExch(address, val);
74 }
75 
76 template <>
77 __device__ inline double atomicExchCustom(double* address, double val) {
78  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80 }
81 
82 #ifdef EIGEN_HAS_GPU_FP16
83 template <typename R>
84 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
85  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86  unsigned int newval = oldval;
87  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88  if (newval == oldval) {
89  return;
90  }
91  unsigned int readback;
92  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93  oldval = readback;
94  newval = oldval;
95  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96  if (newval == oldval) {
97  return;
98  }
99  }
100 }
101 // The reduction must be associative: the packet-wide update is not atomic as a whole, only each individual half2 update is.
102 template <typename R>
103 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
104  half2* houtput = reinterpret_cast<half2*>(output);
105  half2* haccum = reinterpret_cast<half2*>(&accum);
106  for (int i = 0; i < 4; ++i) {
107  atomicReduce(houtput + i, *(haccum + i), reducer);
108  }
109 }
110 #endif // EIGEN_HAS_GPU_FP16
111 
112 template <>
113 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
114 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
115  atomicAdd(output, accum);
116 #else // EIGEN_CUDA_ARCH >= 300
117  gpu_assert(0 && "Shouldn't be called on unsupported device");
118 #endif // EIGEN_CUDA_ARCH >= 300
119 }
120 
121 
122 template <typename CoeffType, typename Index>
123 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
124  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
125  const Index num_threads = blockDim.x * gridDim.x;
126  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
127  output[i] = val;
128  }
129 }
130 
131 
132 template <int BlockSize, int NumPerThread, typename Self,
133  typename Reducer, typename Index>
134 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
135  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
136 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
137  // Initialize the output value
138  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
139  if (gridDim.x == 1) {
140  if (first_index == 0) {
141  *output = reducer.initialize();
142  }
143  }
144  else {
145  if (threadIdx.x == 0) {
146  unsigned int block = atomicCAS(semaphore, 0u, 1u);
147  if (block == 0) {
148  // We're the first block to run, initialize the output value
149  atomicExchCustom(output, reducer.initialize());
150  __threadfence();
151  atomicExch(semaphore, 2u);
152  }
153  else {
154  // Wait for the first block to initialize the output value.
155  // Use atomicCAS here to ensure that the reads aren't cached
156  unsigned int val;
157  do {
158  val = atomicCAS(semaphore, 2u, 2u);
159  }
160  while (val < 2u);
161  }
162  }
163  }
164 
165  __syncthreads();
166 
167  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
168 
169  typename Self::CoeffReturnType accum = reducer.initialize();
170  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
171  for (Index i = 0; i < max_iter; i+=BlockSize) {
172  const Index index = first_index + i;
173  eigen_assert(index < num_coeffs);
174  typename Self::CoeffReturnType val = input.m_impl.coeff(index);
175  reducer.reduce(val, &accum);
176  }
177 
178 #pragma unroll
179  for (int offset = warpSize/2; offset > 0; offset /= 2) {
180  #if defined(EIGEN_HIPCC)
181  // use std::is_floating_point to determine the type of reduced_val
 182  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
 183  // and list the float and int versions of __shfl_down as the candidate functions.
 184  if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
 185  reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
186  } else {
187  reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
188  }
189  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
190  reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
191  #else
192  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
193  #endif
194  }
195 
196  if ((threadIdx.x & (warpSize - 1)) == 0) {
197  atomicReduce(output, accum, reducer);
198  }
199 
200  if (gridDim.x > 1 && threadIdx.x == 0) {
201  // Let the last block reset the semaphore
202  atomicInc(semaphore, gridDim.x + 1);
203 #if defined(EIGEN_HIPCC)
204  __threadfence_system();
205 #endif
206  }
207 #else // EIGEN_CUDA_ARCH >= 300
208  gpu_assert(0 && "Shouldn't be called on unsupported device");
209 #endif // EIGEN_CUDA_ARCH >= 300
210 }
211 
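For reference, the warp-level folding performed by the shuffle loop in FullReductionKernel can be written out for a plain float sum with the CUDA 9+ intrinsic. warp_sum is a hypothetical helper, not an Eigen function, and is only meant to show the access pattern.

// Sketch only: after log2(warpSize) steps lane 0 holds the sum of all lanes.
__device__ float warp_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Each lane adds the value held by the lane `offset` positions above it.
    val += __shfl_down_sync(0xFFFFFFFF, val, offset, warpSize);
  }
  return val;  // meaningful in lane 0 only
}

The kernel above routes the combine step through reducer.reduce instead of a hard-coded addition, so the same loop serves sums, minima, maxima and other associative reducers, and only lane 0 then calls atomicReduce on the shared output.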
212 
213 #ifdef EIGEN_HAS_GPU_FP16
214 template <typename Self,
215  typename Reducer, typename Index>
216 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
217  packet_traits<Eigen::half>::type* scratch) {
218  eigen_assert(blockDim.x == 1);
219  eigen_assert(gridDim.x == 1);
220  typedef packet_traits<Eigen::half>::type packet_type;
221  Index packet_remainder =
222  (num_coeffs % Index(unpacket_traits<packet_type>::size));
223  if (packet_remainder != 0) {
224  half2* h2scratch = reinterpret_cast<half2*>(scratch);
225  for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
226  *h2scratch =
227  __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
228  h2scratch++;
229  }
230  if ((num_coeffs & 1) != 0) {
231  half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
232  *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
233  }
234  } else {
235  *scratch = reducer.template initializePacket<packet_type>();
236  }
237 }
238 
239 template <typename Self,
240  typename Reducer, typename Index>
241 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
242  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
243  const Index num_threads = blockDim.x * gridDim.x;
244  typedef typename packet_traits<Eigen::half>::type PacketType;
245 
246  const Index num_packets =
247  num_coeffs / Index(unpacket_traits<PacketType>::size);
248  PacketType* p_output = reinterpret_cast<PacketType*>(output);
249  for (Index i = thread_id; i < num_packets; i += num_threads) {
250  p_output[i] = reducer.template initializePacket<PacketType>();
251  }
252  Index packet_remainder =
253  (num_coeffs % Index(unpacket_traits<PacketType>::size));
254  if (thread_id < packet_remainder) {
255  output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
256  }
257 }
258 
259 template <int BlockSize, int NumPerThread, typename Self,
260  typename Reducer, typename Index>
261 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
262  half* output, packet_traits<Eigen::half>::type* scratch) {
263  typedef typename packet_traits<Eigen::half>::type PacketType;
264  const int packet_width = unpacket_traits<PacketType>::size;
265  eigen_assert(NumPerThread % packet_width == 0);
266  const Index first_index =
267  blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
268 
269  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
270 
271  if (gridDim.x == 1) {
272  if (first_index == 0) {
273  int rem = num_coeffs % packet_width;
274  if (rem != 0) {
275  half2* p_scratch = reinterpret_cast<half2*>(scratch);
276  *scratch = reducer.template initializePacket<PacketType>();
277  for (int i = 0; i < rem / 2; i++) {
278  *p_scratch = __halves2half2(
279  input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
280  input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
281  p_scratch++;
282  }
283  if ((num_coeffs & 1) != 0) {
284  half last = input.m_impl.coeff(num_coeffs - 1);
285  *p_scratch = __halves2half2(last, reducer.initialize());
286  }
287  } else {
288  *scratch = reducer.template initializePacket<PacketType>();
289  }
290  }
291  __syncthreads();
292  }
293 
294  PacketType accum = reducer.template initializePacket<PacketType>();
295  const Index max_iter =
296  numext::mini<Index>((num_coeffs - first_index) / packet_width,
297  NumPerThread * BlockSize / packet_width);
298  for (Index i = 0; i < max_iter; i += BlockSize) {
299  const Index index = first_index + packet_width * i;
300  eigen_assert(index + packet_width < num_coeffs);
301  PacketType val = input.m_impl.template packet<Unaligned>(index);
302  reducer.reducePacket(val, &accum);
303  }
304 
305 #pragma unroll
306  for (int offset = warpSize/2; offset > 0; offset /= 2) {
307  #if defined(EIGEN_HIPCC)
308  PacketType r1;
309  half2* hr = reinterpret_cast<half2*>(&r1);
310  half2* hacc = reinterpret_cast<half2*>(&accum);
311  for (int i = 0; i < packet_width / 2; i++) {
312  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
313  union { int i; half2 h; } wka_in, wka_out;
314  wka_in.h = hacc[i];
315  wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
316  hr[i] = wka_out.h;
317  }
318  reducer.reducePacket(r1, &accum);
319  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
320  PacketType r1;
321  half2* hr = reinterpret_cast<half2*>(&r1);
322  half2* hacc = reinterpret_cast<half2*>(&accum);
323  for (int i = 0; i < packet_width / 2; i++) {
324  hr[i] = __shfl_down(hacc[i], offset, warpSize);
325  }
326  reducer.reducePacket(r1, &accum);
327  #else
328  PacketType r1;
329  half2* hr = reinterpret_cast<half2*>(&r1);
330  half2* hacc = reinterpret_cast<half2*>(&accum);
331  for (int i = 0; i < packet_width / 2; i++) {
332  hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
333  }
334  reducer.reducePacket(r1, &accum);
335 
336  #endif
337  }
338 
339  if ((threadIdx.x & (warpSize - 1)) == 0) {
340  atomicReduce(scratch, accum, reducer);
341  }
342 
343  __syncthreads();
344  half2* rv1 = reinterpret_cast<half2*>(scratch);
345  if (packet_width > 2) {
346  reducer.reducePacket(rv1[2], rv1);
347  reducer.reducePacket(rv1[3], rv1 + 1);
348  reducer.reducePacket(rv1[1], rv1);
349  }
350  if (gridDim.x == 1) {
351  if (first_index == 0) {
352  half tmp = __low2half(*rv1);
353  reducer.reduce(__high2half(*rv1), &tmp);
354  *output = tmp;
355  }
356  }
357 }
358 
359 template <typename Op>
360 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
361  eigen_assert(threadIdx.x == 1);
362  half2* pscratch = reinterpret_cast<half2*>(scratch);
363  half tmp = __float2half(0.f);
364  typedef packet_traits<Eigen::half>::type packet_type;
365  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
366  reducer.reduce(__low2half(*pscratch), &tmp);
367  reducer.reduce(__high2half(*pscratch), &tmp);
368  pscratch++;
369  }
370  *output = tmp;
371 }
372 
373 #endif // EIGEN_HAS_GPU_FP16
374 
375 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
376 struct FullReductionLauncher {
377  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
378  gpu_assert(false && "Should only be called on doubles, floats and half floats");
379  }
380 };
381 
382 // Specialization for float and double
383 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
384 struct FullReductionLauncher<
385  Self, Op, OutputType, PacketAccess,
386  typename internal::enable_if<
387  internal::is_same<float, OutputType>::value ||
388  internal::is_same<double, OutputType>::value,
389  void>::type> {
390  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
391 
392  typedef typename Self::Index Index;
393  const int block_size = 256;
394  const int num_per_thread = 128;
395  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
396 
397  unsigned int* semaphore = NULL;
398  if (num_blocks > 1) {
399  semaphore = device.semaphore();
400  }
401 
402  LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
403  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
404  }
405 };
406 
407 #ifdef EIGEN_HAS_GPU_FP16
408 template <typename Self, typename Op>
409 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
410  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
411  gpu_assert(false && "Should not be called since there is no packet accessor");
412  }
413 };
414 
415 template <typename Self, typename Op>
416 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
417  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
418  typedef typename Self::Index Index;
419  typedef typename packet_traits<Eigen::half>::type PacketType;
420 
421  const int block_size = 256;
422  const int num_per_thread = 128;
423  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
424  PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
425  // half2* scratch = static_cast<half2*>(device.scratchpad());
426 
427  if (num_blocks > 1) {
428  // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
429  // won't be race conditions between multiple thread blocks.
430  LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
431  1, 1, 0, device, reducer, self, num_coeffs, scratch);
432  }
433 
434  LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
435  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
436 
437  if (num_blocks > 1) {
438  LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
439  1, 1, 0, device, reducer, output, scratch);
440  }
441  }
442 };
443 #endif // EIGEN_HAS_GPU_FP16
444 
445 
446 template <typename Self, typename Op, bool Vectorizable>
447 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
448  // Unfortunately nvidia doesn't handle exotic types such as complex well,
449  // so we reduce the scope of the optimized version of the code to the simple cases
450  // of doubles, floats and half floats.
451 #ifdef EIGEN_HAS_GPU_FP16
452  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
453  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
454  internal::is_same<typename Self::CoeffReturnType, double>::value ||
455  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
456 #else // EIGEN_HAS_GPU_FP16
457  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
458  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
459  internal::is_same<typename Self::CoeffReturnType, double>::value);
460 #endif // EIGEN_HAS_GPU_FP16
461 
462  template <typename OutputType>
463  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
464  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
465  const Index num_coeffs = array_prod(self.m_impl.dimensions());
466  // Don't crash when we're called with an input tensor of size 0.
467  if (num_coeffs == 0) {
468  return;
469  }
470 
471  FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
472  }
473 };
474 
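As a usage sketch (assumptions: d_in and d_out are valid device allocations of the right size, and the translation unit is compiled as GPU code with EIGEN_USE_GPU defined), a full reduction written with the Tensor API is what ends up in FullReducer::run above:

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void full_sum_on_gpu(float* d_in, float* d_out) {
  Eigen::GpuStreamDevice stream;                           // wraps a GPU stream
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 1024);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);   // rank-0: a single scalar
  out.device(gpu_device) = in.sum();                       // full reduction on the GPU
  gpu_device.synchronize();                                // wait for the kernel to finish
}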
475 
476 template <int NumPerThread, typename Self,
477  typename Reducer, typename Index>
478 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
479  typename Self::CoeffReturnType* output) {
480 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
481  typedef typename Self::CoeffReturnType Type;
482  eigen_assert(blockDim.y == 1);
483  eigen_assert(blockDim.z == 1);
484  eigen_assert(gridDim.y == 1);
485  eigen_assert(gridDim.z == 1);
486 
487  const int unroll_times = 16;
488  eigen_assert(NumPerThread % unroll_times == 0);
489 
490  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
491  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
492 
493  const Index num_threads = blockDim.x * gridDim.x;
494  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
495 
496  // Initialize the output values if they weren't initialized by the ReductionInitKernel
497  if (gridDim.x == 1) {
498  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
499  output[i] = reducer.initialize();
500  }
501  __syncthreads();
502  }
503 
504  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
505  const Index row = i / input_col_blocks;
506 
507  if (row < num_preserved_coeffs) {
508  const Index col_block = i % input_col_blocks;
509  const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
510 
511  Type reduced_val = reducer.initialize();
512 
513  for (Index j = 0; j < NumPerThread; j += unroll_times) {
514  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
515  if (last_col >= num_coeffs_to_reduce) {
516  for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
517  const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
518  reducer.reduce(val, &reduced_val);
519  }
520  break;
521  } else {
522  // Faster version of the loop with no branches after unrolling.
523 #pragma unroll
524  for (int k = 0; k < unroll_times; ++k) {
525  const Index col = col_begin + blockDim.x * (j + k);
526  reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
527  }
528  }
529  }
530 
531 #pragma unroll
532  for (int offset = warpSize/2; offset > 0; offset /= 2) {
533  #if defined(EIGEN_HIPCC)
534  // use std::is_floating_point to determine the type of reduced_val
 535  // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
 536  // and list the float and int versions of __shfl_down as the candidate functions.
 537  if (std::is_floating_point<Type>::value) {
 538  reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
539  } else {
540  reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
541  }
542  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
543  reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
544  #else
545  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
546  #endif
547  }
548 
549  if ((threadIdx.x & (warpSize - 1)) == 0) {
550  atomicReduce(&(output[row]), reduced_val, reducer);
551  }
552  }
553  }
554 #else // EIGEN_CUDA_ARCH >= 300
555  gpu_assert(0 && "Shouldn't be called on unsupported device");
556 #endif // EIGEN_CUDA_ARCH >= 300
557 }
558 
559 #ifdef EIGEN_HAS_GPU_FP16
560 
561 template <int NumPerThread, typename Self,
562  typename Reducer, typename Index>
563 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
564  half* output) {
565  eigen_assert(blockDim.y == 1);
566  eigen_assert(blockDim.z == 1);
567  eigen_assert(gridDim.y == 1);
568  eigen_assert(gridDim.z == 1);
569 
570  typedef typename packet_traits<Eigen::half>::type PacketType;
571  const int packet_width = unpacket_traits<PacketType>::size;
572  const int unroll_times = 16 / packet_width;
573  eigen_assert(NumPerThread % unroll_times == 0);
574  eigen_assert(unroll_times % 2 == 0);
575 
576  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
577  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
578 
579  const Index num_threads = blockDim.x * gridDim.x;
580  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
581 
582  // Initialize the output values if they weren't initialized by the ReductionInitKernel
583  if (gridDim.x == 1) {
584  Index i = packet_width * thread_id;
585  for (; i + packet_width <= num_preserved_coeffs;
586  i += packet_width * num_threads) {
587  PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
588  *poutput = reducer.template initializePacket<PacketType>();
589  }
590  if (i < num_preserved_coeffs) {
591  output[i] = reducer.initialize();
592  }
593  __syncthreads();
594  }
595 
596  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
597  const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
598 
599  if (row + 1 < num_preserved_coeffs) {
600  const Index col_block = i % input_col_blocks;
601  const Index col_begin =
602  packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
603 
604  PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
605  PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
606 
607  for (Index j = 0; j < NumPerThread; j += unroll_times) {
608  const Index last_col =
609  col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
610  if (last_col >= num_coeffs_to_reduce) {
611  Index col = col_begin + blockDim.x * j;
612  for (; col + packet_width <= num_coeffs_to_reduce;
613  col += blockDim.x) {
614  const PacketType val1 = input.m_impl.template packet<Unaligned>(
615  row * num_coeffs_to_reduce + col);
616  reducer.reducePacket(val1, &reduced_val1);
617  const PacketType val2 = input.m_impl.template packet<Unaligned>(
618  (row + 1) * num_coeffs_to_reduce + col);
619  reducer.reducePacket(val2, &reduced_val2);
620  }
621  if (col < num_coeffs_to_reduce) {
622  PacketType r1 = reducer.template initializePacket<PacketType>();
623  PacketType r2 = reducer.template initializePacket<PacketType>();
624  half2* hr1 = reinterpret_cast<half2*>(&r1);
625  half2* hr2 = reinterpret_cast<half2*>(&r2);
626  while (col + 1 < num_coeffs_to_reduce) {
627  *hr1 = __halves2half2(
628  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
629  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
630  *hr2 = __halves2half2(
631  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
632  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
633  1));
634  hr1++;
635  hr2++;
636  col += 2;
637  }
638  if (col < num_coeffs_to_reduce) {
639  // Peel;
640  const half last1 =
641  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
642  *hr1 = __halves2half2(last1, reducer.initialize());
643  const half last2 =
644  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
645  *hr2 = __halves2half2(last2, reducer.initialize());
646  }
647  reducer.reducePacket(r1, &reduced_val1);
648  reducer.reducePacket(r2, &reduced_val2);
649  }
650  break;
651  } else {
652  // Faster version of the loop with no branches after unrolling.
653 #pragma unroll
654  for (int k = 0; k < unroll_times; ++k) {
655  const Index col = col_begin + blockDim.x * (j + k) * packet_width;
656  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
657  row * num_coeffs_to_reduce + col),
658  &reduced_val1);
659  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
660  (row + 1) * num_coeffs_to_reduce + col),
661  &reduced_val2);
662  }
663  }
664  }
665 
666 #pragma unroll
667  for (int offset = warpSize/2; offset > 0; offset /= 2) {
668  #if defined(EIGEN_HIPCC)
669  PacketType r1;
670  PacketType r2;
671  half2* hr1 = reinterpret_cast<half2*>(&r1);
672  half2* hr2 = reinterpret_cast<half2*>(&r2);
673  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
674  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
675  for (int i = 0; i < packet_width / 2; i++) {
676  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
677  union { int i; half2 h; } wka_in1, wka_out1;
678  wka_in1.h = rv1[i];
679  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
680  hr1[i] = wka_out1.h;
681 
682  union { int i; half2 h; } wka_in2, wka_out2;
683  wka_in2.h = rv2[i];
684  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
685  hr2[i] = wka_out2.h;
686  }
687  reducer.reducePacket(r1, &reduced_val1);
688  reducer.reducePacket(r2, &reduced_val2);
689  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
690  PacketType r1;
691  PacketType r2;
692  half2* hr1 = reinterpret_cast<half2*>(&r1);
693  half2* hr2 = reinterpret_cast<half2*>(&r2);
694  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
695  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
696  for (int i = 0; i < packet_width / 2; i++) {
697  hr1[i] = __shfl_down(rv1[i], offset, warpSize);
698  hr2[i] = __shfl_down(rv2[i], offset, warpSize);
699  }
700  reducer.reducePacket(r1, &reduced_val1);
701  reducer.reducePacket(r2, &reduced_val2);
702  #else
703  PacketType r1;
704  PacketType r2;
705  half2* hr1 = reinterpret_cast<half2*>(&r1);
706  half2* hr2 = reinterpret_cast<half2*>(&r2);
707  half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
708  half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
709  for (int i = 0; i < packet_width / 2; i++) {
710  hr1[i] =
711  __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
712  hr2[i] =
713  __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
714  }
715  reducer.reducePacket(r1, &reduced_val1);
716  reducer.reducePacket(r2, &reduced_val2);
717 
718  #endif
719  }
720  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
721  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
722  half2 val;
723  if (packet_width > 2) {
724  reducer.reducePacket(rv1[2], rv1);
725  reducer.reducePacket(rv1[3], rv1 + 1);
726  reducer.reducePacket(rv1[1], rv1);
727  reducer.reducePacket(rv2[2], rv2);
728  reducer.reducePacket(rv2[3], rv2 + 1);
729  reducer.reducePacket(rv2[1], rv2);
730  }
731  half val1 = __low2half(*rv1);
732  reducer.reduce(__high2half(*rv1), &val1);
733  half val2 = __low2half(*rv2);
734  reducer.reduce(__high2half(*rv2), &val2);
735  val = __halves2half2(val1, val2);
736  if ((threadIdx.x & (warpSize - 1)) == 0) {
737  half* loc = output + row;
738  atomicReduce((half2*)loc, val, reducer);
739  }
740  }
741  }
742 }
743 
744 #endif // EIGEN_HAS_GPU_FP16
745 
746 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
747 struct InnerReductionLauncher {
748  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
749  gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
750  return true;
751  }
752 };
753 
754 // Specialization for float and double
755 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
756 struct InnerReductionLauncher<
757  Self, Op, OutputType, PacketAccess,
758  typename internal::enable_if<
759  internal::is_same<float, OutputType>::value ||
760  internal::is_same<double, OutputType>::value,
761  void>::type> {
762  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
763  typedef typename Self::Index Index;
764 
765  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
766  const int block_size = 256;
767  const int num_per_thread = 128;
768  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
769  const int max_blocks = device.getNumGpuMultiProcessors() *
770  device.maxGpuThreadsPerMultiProcessor() / block_size;
771  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
772 
773  if (num_blocks > 1) {
774  // We initialize the outputs outside the reduction kernel when we can't be sure that there
 775  // won't be race conditions between multiple thread blocks.
776  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
777  const int max_blocks = device.getNumGpuMultiProcessors() *
778  device.maxGpuThreadsPerMultiProcessor() / 1024;
779  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
780  LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
781  num_blocks, 1024, 0, device, reducer.initialize(),
782  num_preserved_vals, output);
783  }
784 
785  LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
786  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
787 
788  return false;
789  }
790 };
791 
792 #ifdef EIGEN_HAS_GPU_FP16
793 template <typename Self, typename Op>
794 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
795  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
796  gpu_assert(false && "Should not be called since there is no packet accessor");
797  return true;
798  }
799 };
800 
801 template <typename Self, typename Op>
802 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
803  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
804  typedef typename Self::Index Index;
805 
806  if (num_preserved_vals % 2 != 0) {
807  // Not supported yet, revert to the slower code path
808  return true;
809  }
810 
811  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
812  const int block_size = /*256*/128;
813  const int num_per_thread = /*128*/64;
814  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
815  const int max_blocks = device.getNumGpuMultiProcessors() *
816  device.maxGpuThreadsPerMultiProcessor() / block_size;
817  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
818 
819  if (num_blocks > 1) {
820  // We initialize the outputs outside the reduction kernel when we can't be sure that there
 821  // won't be race conditions between multiple thread blocks.
822  LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
823  1, 1, 0, device, reducer, self, num_preserved_vals, output);
824  }
825 
826  LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
827  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
828 
829  return false;
830  }
831 };
832 #endif // EIGEN_HAS_GPU_FP16
833 
834 
835 template <typename Self, typename Op>
836 struct InnerReducer<Self, Op, GpuDevice> {
837  // Unfortunately nvidia doesn't handle exotic types such as complex well,
838  // so we reduce the scope of the optimized version of the code to the simple case
839  // of floats and half floats.
840 #ifdef EIGEN_HAS_GPU_FP16
841  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
842  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
843  internal::is_same<typename Self::CoeffReturnType, double>::value ||
844  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
845 #else // EIGEN_HAS_GPU_FP16
846  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
847  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
848  internal::is_same<typename Self::CoeffReturnType, double>::value);
849 #endif // EIGEN_HAS_GPU_FP16
850 
851  template <typename OutputType>
852  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
853  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
854  const Index num_coeffs = array_prod(self.m_impl.dimensions());
855  // Don't crash when we're called with an input tensor of size 0.
856  if (num_coeffs == 0) {
857  return true;
858  }
859  // It's faster to use the usual code.
860  if (num_coeffs_to_reduce <= 128) {
861  return true;
862  }
863 
864  return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
865  }
866 };
867 
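Under the same hypothetical setup as the full-reduction sketch earlier, reducing the innermost (first, column-major) dimension is the case InnerReducer accelerates; note the fallback above when 128 or fewer coefficients are reduced per output.

void inner_sum_on_gpu(float* d_in, float* d_out) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 1024, 256);  // 1024 values folded per output
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 256);
  Eigen::array<Eigen::Index, 1> red_dim;
  red_dim[0] = 0;                                // innermost dimension (column-major)
  out.device(gpu_device) = in.sum(red_dim);      // handled by the inner-reduction path
  gpu_device.synchronize();
}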
868 template <int NumPerThread, typename Self,
869  typename Reducer, typename Index>
870 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
871  typename Self::CoeffReturnType* output) {
872  const Index num_threads = blockDim.x * gridDim.x;
873  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
874  // Initialize the output values if they weren't initialized by the ReductionInitKernel
875  if (gridDim.x == 1) {
876  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
877  output[i] = reducer.initialize();
878  }
879  __syncthreads();
880  }
881 
882  // Do the reduction.
883  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
884  for (Index i = thread_id; i < max_iter; i += num_threads) {
885  const Index input_col = i % num_preserved_coeffs;
886  const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
887  typename Self::CoeffReturnType reduced_val = reducer.initialize();
888  const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
889  for (Index j = input_row; j < max_row; j++) {
890  typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
891  reducer.reduce(val, &reduced_val);
892  }
893  atomicReduce(&(output[input_col]), reduced_val, reducer);
894  }
895 }
896 
897 
898 template <typename Self, typename Op>
899 struct OuterReducer<Self, Op, GpuDevice> {
900  // Unfortunately nvidia doesn't handle exotic types such as complex well,
901  // so we reduce the scope of the optimized version of the code to the simple case
902  // of floats.
903  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
904  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
905  internal::is_same<typename Self::CoeffReturnType, double>::value);
906  template <typename Device, typename OutputType>
907  static
908  #if !defined(EIGEN_HIPCC)
909  // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
910  // (in the cxx11_tensor_reduction_gpu test)
911  //
912  // terminate called after throwing an instance of 'std::runtime_error'
913  // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
914  //
915  // don't know why this happens (and why is it a runtime error instead of a compile time error)
916  //
917  // this will be fixed by HIP PR#457
918  EIGEN_DEVICE_FUNC
919  #endif
920  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
921  gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
922  return true;
923  }
924 
925  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
926  typedef typename Self::Index Index;
927 
928  // It's faster to use the usual code.
929  if (num_coeffs_to_reduce <= 32) {
930  return true;
931  }
932 
933  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
934  const int block_size = 256;
935  const int num_per_thread = 16;
936  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
937  const int max_blocks = device.getNumGpuMultiProcessors() *
938  device.maxGpuThreadsPerMultiProcessor() / block_size;
939  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
940 
941  if (num_blocks > 1) {
942  // We initialize the outputs outside the reduction kernel when we can't be sure that there
943  // won't be race conditions between multiple thread blocks.
944  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
945  const int max_blocks = device.getNumGpuMultiProcessors() *
946  device.maxGpuThreadsPerMultiProcessor() / 1024;
947  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
948  LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
949  num_blocks, 1024, 0, device, reducer.initialize(),
950  num_preserved_vals, output);
951  }
952 
953  LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
954  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
955 
956  return false;
957  }
958 };
959 
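And the mirror case, again under the same hypothetical setup: reducing the outermost (last, column-major) dimension while preserving the inner one is what OuterReducer handles, with the fallback above kicking in when 32 or fewer coefficients are reduced per output.

void outer_sum_on_gpu(float* d_in, float* d_out) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 256, 1024);  // keep 256 rows, fold 1024 columns
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 256);
  Eigen::array<Eigen::Index, 1> red_dim;
  red_dim[0] = 1;                                // outermost dimension (column-major)
  out.device(gpu_device) = in.sum(red_dim);      // handled by the outer-reduction path
  gpu_device.synchronize();
}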
960 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
961 
962 
963 } // end namespace internal
964 } // end namespace Eigen
965 
966 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H