TensorReductionCuda.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
12 
13 namespace Eigen {
14 namespace internal {
15 
16 
17 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
18 // Full reducers for GPU, don't vectorize for now
19 
20 // Reducer function that enables multiple CUDA threads to safely accumulate at the same
21 // output address. It reads the current value of the output variable and attempts to
22 // update it with the new value. If another CUDA thread has updated the content of the
23 // output address in the meantime, it tries again.
24 template <typename T, typename R>
25 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26 #if __CUDA_ARCH__ >= 300
27  if (sizeof(T) == 4)
28  {
29  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30  unsigned int newval = oldval;
31  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32  if (newval == oldval) {
33  return;
34  }
35  unsigned int readback;
36  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37  oldval = readback;
38  newval = oldval;
39  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40  if (newval == oldval) {
41  return;
42  }
43  }
44  }
45  else if (sizeof(T) == 8) {
46  unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47  unsigned long long newval = oldval;
48  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49  if (newval == oldval) {
50  return;
51  }
52  unsigned long long readback;
53  while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54  oldval = readback;
55  newval = oldval;
56  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57  if (newval == oldval) {
58  return;
59  }
60  }
61  }
62  else {
63  assert(0 && "Wordsize not supported");
64  }
65 #else
66  assert(0 && "Shouldn't be called on unsupported device");
67 #endif
68 }
69 
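As a reference for the type-punned compare-and-swap loop above, here is a minimal standalone sketch of the same retry pattern for a plain float max reduction; atomicMaxFloat is an illustrative name and is not part of this header.

    __device__ void atomicMaxFloat(float* output, float value) {
      // Reinterpret the float storage as a 32-bit word so atomicCAS can operate on it.
      unsigned int* addr = reinterpret_cast<unsigned int*>(output);
      unsigned int assumed;
      unsigned int old = *addr;
      do {
        assumed = old;
        float current = __uint_as_float(assumed);
        float updated = fmaxf(current, value);
        if (updated == current) return;  // another thread already stored a value at least as large
        // Publish only if *output is still what we read; otherwise retry with the fresh value.
        old = atomicCAS(addr, assumed, __float_as_uint(updated));
      } while (old != assumed);
    }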
70 // We extend atomicExch to support extra data types
71 template <typename Type>
72 __device__ inline Type atomicExchCustom(Type* address, Type val) {
73  return atomicExch(address, val);
74 }
75 
76 template <>
77 __device__ inline double atomicExchCustom(double* address, double val) {
78  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80 }
81 
82 #ifdef EIGEN_HAS_CUDA_FP16
83 template <template <typename T> class R>
84 __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
85  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86  unsigned int newval = oldval;
87  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88  if (newval == oldval) {
89  return;
90  }
91  unsigned int readback;
92  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93  oldval = readback;
94  newval = oldval;
95  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96  if (newval == oldval) {
97  return;
98  }
99  }
100 }
101 #endif
102 
103 template <>
104 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
105 #if __CUDA_ARCH__ >= 300
106  atomicAdd(output, accum);
107 #else
108  assert(0 && "Shouldn't be called on unsupported device");
109 #endif
110 }
111 
112 
113 template <typename CoeffType, typename Index>
114 __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
115  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
116  const Index num_threads = blockDim.x * gridDim.x;
117  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
118  output[i] = val;
119  }
120 }
121 
122 
123 template <int BlockSize, int NumPerThread, typename Self,
124  typename Reducer, typename Index>
125 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
126  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
127 #if __CUDA_ARCH__ >= 300
128  // Initialize the output value
129  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
130  if (gridDim.x == 1) {
131  if (first_index == 0) {
132  *output = reducer.initialize();
133  }
134  }
135  else {
136  if (threadIdx.x == 0) {
137  unsigned int block = atomicCAS(semaphore, 0u, 1u);
138  if (block == 0) {
139  // We're the first block to run, initialize the output value
140  atomicExchCustom(output, reducer.initialize());
141  __threadfence();
142  atomicExch(semaphore, 2u);
143  }
144  else {
145  // Wait for the first block to initialize the output value.
146  // Use atomicCAS here to ensure that the reads aren't cached
147  unsigned int val;
148  do {
149  val = atomicCAS(semaphore, 2u, 2u);
150  }
151  while (val < 2u);
152  }
153  }
154  }
155 
156  __syncthreads();
157 
158  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
159 
160  typename Self::CoeffReturnType accum = reducer.initialize();
161  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
162  for (Index i = 0; i < max_iter; i+=BlockSize) {
163  const Index index = first_index + i;
164  eigen_assert(index < num_coeffs);
165  typename Self::CoeffReturnType val = input.m_impl.coeff(index);
166  reducer.reduce(val, &accum);
167  }
168 
169 #pragma unroll
170  for (int offset = warpSize/2; offset > 0; offset /= 2) {
171  reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
172  }
173 
174  if ((threadIdx.x & (warpSize - 1)) == 0) {
175  atomicReduce(output, accum, reducer);
176  }
177 
178  if (gridDim.x > 1 && threadIdx.x == 0) {
179  // Let the last block reset the semaphore
180  atomicInc(semaphore, gridDim.x + 1);
181  }
182 #else
183  assert(0 && "Shouldn't be called on unsupported device");
184 #endif
185 }
186 
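The semaphore protocol used by FullReductionKernel can be isolated into a small sketch: the first block to arrive initializes the output, later blocks spin until the value is published, and the final atomicInc wraps the counter back to 0 so the semaphore can be reused. The kernel below (initOnceThenAccumulate) is illustrative only, assuming a simple float sum rather than a templated reducer.

    __global__ void initOnceThenAccumulate(const float* input, int n, float* output,
                                           unsigned int* semaphore) {
      if (threadIdx.x == 0) {
        if (atomicCAS(semaphore, 0u, 1u) == 0u) {
          *output = 0.0f;                               // we won the race: initialize the output
          __threadfence();                              // make the store visible to other blocks
          atomicExch(semaphore, 2u);                    // publish "initialized"
        } else {
          while (atomicCAS(semaphore, 2u, 2u) < 2u) {}  // spin until the first block publishes
        }
      }
      __syncthreads();
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        atomicAdd(output, input[i]);                    // accumulate (grid-stride loop omitted)
      }
      if (threadIdx.x == 0) {
        atomicInc(semaphore, gridDim.x + 1);            // the last block wraps the counter to 0
      }
    }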
187 
188 #ifdef EIGEN_HAS_CUDA_FP16
189 template <typename Self,
190  typename Reducer, typename Index>
191 __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
192  eigen_assert(blockDim.x == 1);
193  eigen_assert(gridDim.x == 1);
194  if (num_coeffs % 2 != 0) {
195  half last = input.m_impl.coeff(num_coeffs-1);
196  *scratch = __halves2half2(last, reducer.initialize());
197  } else {
198  *scratch = reducer.template initializePacket<half2>();
199  }
200 }
201 
202 template <typename Self,
203  typename Reducer, typename Index>
204 __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
205  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
206  const Index num_threads = blockDim.x * gridDim.x;
207  const Index num_packets = num_coeffs / 2;
208  for (Index i = thread_id; i < num_packets; i += num_threads) {
209  ((half2*)output)[i] = reducer.template initializePacket<half2>();
210  }
211 
212  if (thread_id == 0 && num_coeffs % 2 != 0) {
213  output[num_coeffs-1] = reducer.initialize();
214  }
215 }
216 
217 template <int BlockSize, int NumPerThread, typename Self,
218  typename Reducer, typename Index>
219 __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
220  half* output, half2* scratch) {
221  eigen_assert(NumPerThread % 2 == 0);
222 
223  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
224 
225  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
226  if (gridDim.x == 1 && first_index == 0) {
227  if (num_coeffs % 2 != 0) {
228  half last = input.m_impl.coeff(num_coeffs-1);
229  *scratch = __halves2half2(last, reducer.initialize());
230  } else {
231  *scratch = reducer.template initializePacket<half2>();
232  }
233  __syncthreads();
234  }
235 
236  half2 accum = reducer.template initializePacket<half2>();
237  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
238  for (Index i = 0; i < max_iter; i += BlockSize) {
239  const Index index = first_index + 2*i;
240  eigen_assert(index + 1 < num_coeffs);
241  half2 val = input.m_impl.template packet<Unaligned>(index);
242  reducer.reducePacket(val, &accum);
243  }
244 
245 #pragma unroll
246  for (int offset = warpSize/2; offset > 0; offset /= 2) {
247  reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
248  }
249 
250  if ((threadIdx.x & (warpSize - 1)) == 0) {
251  atomicReduce(scratch, accum, reducer);
252  }
253 
254  __syncthreads();
255 
256  if (gridDim.x == 1 && first_index == 0) {
257  half tmp = __low2half(*scratch);
258  reducer.reduce(__high2half(*scratch), &tmp);
259  *output = tmp;
260  }
261 }
262 
263 template <typename Op>
264 __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
265  eigen_assert(threadIdx.x == 0);
266  half tmp = __low2half(*scratch);
267  reducer.reduce(__high2half(*scratch), &tmp);
268  *output = tmp;
269 }
270 
271 #endif
272 
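The half-float kernels above keep two partial results packed in a half2 and only split them at the very end (see ReductionCleanupKernelHalfFloat). Below is a minimal sketch of that final combine step for a sum reduction, assuming cuda_fp16.h's half/half2 types; collapseHalf2 is an illustrative name, not part of Eigen.

    #include <cuda_fp16.h>

    __device__ half collapseHalf2(half2 packed) {
      half lo = __low2half(packed);   // lane 0 of the packet
      half hi = __high2half(packed);  // lane 1 of the packet
      return __hadd(lo, hi);          // native half add; requires __CUDA_ARCH__ >= 530
    }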
273 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
274 struct FullReductionLauncher {
275  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
276  assert(false && "Should only be called on doubles, floats and half floats");
277  }
278 };
279 
280 // Specialization for float and double
281 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
282 struct FullReductionLauncher<
283  Self, Op, OutputType, PacketAccess,
284  typename internal::enable_if<
285  internal::is_same<float, OutputType>::value ||
286  internal::is_same<double, OutputType>::value,
287  void>::type> {
288  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
289  typedef typename Self::Index Index;
290  typedef typename Self::CoeffReturnType Scalar;
291  const int block_size = 256;
292  const int num_per_thread = 128;
293  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
294 
295  unsigned int* semaphore = NULL;
296  if (num_blocks > 1) {
297  semaphore = device.semaphore();
298  }
299 
300  LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
301  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
302  }
303 };
304 
305 #ifdef EIGEN_HAS_CUDA_FP16
306 template <typename Self, typename Op>
307 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
308  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
309  assert(false && "Should not be called since there is no packet accessor");
310  }
311 };
312 
313 template <typename Self, typename Op>
314 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
315  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
316  typedef typename Self::Index Index;
317 
318  const int block_size = 256;
319  const int num_per_thread = 128;
320  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
321  half2* scratch = static_cast<half2*>(device.scratchpad());
322 
323  if (num_blocks > 1) {
324  // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
325  // won't be race conditions between multiple thread blocks.
326  LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
327  1, 1, 0, device, reducer, self, num_coeffs, scratch);
328  }
329 
330  LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
331  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
332 
333  if (num_blocks > 1) {
334  LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
335  1, 1, 0, device, reducer, output, scratch);
336  }
337  }
338 };
339 #endif
340 
341 
342 template <typename Self, typename Op, bool Vectorizable>
343 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
344  // Unfortunately nvidia doesn't handle exotic types such as complex well,
345  // so we reduce the scope of the optimized version of the code to the simple cases
346  // of doubles, floats and half floats.
347 #ifdef EIGEN_HAS_CUDA_FP16
348  static const bool HasOptimizedImplementation = !Op::IsStateful &&
349  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
350  internal::is_same<typename Self::CoeffReturnType, double>::value ||
351  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
352 #else
353  static const bool HasOptimizedImplementation = !Op::IsStateful &&
354  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
355  internal::is_same<typename Self::CoeffReturnType, double>::value);
356 #endif
357 
358  template <typename OutputType>
359  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
360  assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
361  const Index num_coeffs = array_prod(self.m_impl.dimensions());
362  // Don't crash when we're called with an input tensor of size 0.
363  if (num_coeffs == 0) {
364  return;
365  }
366 
367  FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
368  }
369 };
370 
371 
372 template <int NumPerThread, typename Self,
373  typename Reducer, typename Index>
374 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
375  typename Self::CoeffReturnType* output) {
376 #if __CUDA_ARCH__ >= 300
377  typedef typename Self::CoeffReturnType Type;
378  eigen_assert(blockDim.y == 1);
379  eigen_assert(blockDim.z == 1);
380  eigen_assert(gridDim.y == 1);
381  eigen_assert(gridDim.z == 1);
382 
383  const int unroll_times = 16;
384  eigen_assert(NumPerThread % unroll_times == 0);
385 
386  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
387  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
388 
389  const Index num_threads = blockDim.x * gridDim.x;
390  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
391 
392  // Initialize the output values if they weren't initialized by the ReductionInitKernel
393  if (gridDim.x == 1) {
394  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
395  output[i] = reducer.initialize();
396  }
397  __syncthreads();
398  }
399 
400  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
401  const Index row = i / input_col_blocks;
402 
403  if (row < num_preserved_coeffs) {
404  const Index col_block = i % input_col_blocks;
405  const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
406 
407  Type reduced_val = reducer.initialize();
408 
409  for (Index j = 0; j < NumPerThread; j += unroll_times) {
410  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
411  if (last_col >= num_coeffs_to_reduce) {
412  for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
413  const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
414  reducer.reduce(val, &reduced_val);
415  }
416  break;
417  } else {
418  // Faster version of the loop with no branches after unrolling.
419 #pragma unroll
420  for (int k = 0; k < unroll_times; ++k) {
421  const Index col = col_begin + blockDim.x * (j + k);
422  reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
423  }
424  }
425  }
426 
427 #pragma unroll
428  for (int offset = warpSize/2; offset > 0; offset /= 2) {
429  reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
430  }
431 
432  if ((threadIdx.x & (warpSize - 1)) == 0) {
433  atomicReduce(&(output[row]), reduced_val, reducer);
434  }
435  }
436  }
437 #else
438  assert(0 && "Shouldn't be called on unsupported device");
439 #endif
440 }
441 
442 #ifdef EIGEN_HAS_CUDA_FP16
443 
444 template <int NumPerThread, typename Self,
445  typename Reducer, typename Index>
446 __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
447  half* output) {
448  eigen_assert(blockDim.y == 1);
449  eigen_assert(blockDim.z == 1);
450  eigen_assert(gridDim.y == 1);
451  eigen_assert(gridDim.z == 1);
452 
453  const int unroll_times = 16;
454  eigen_assert(NumPerThread % unroll_times == 0);
455  eigen_assert(unroll_times % 2 == 0);
456 
457  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
458  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
459 
460  const Index num_threads = blockDim.x * gridDim.x;
461  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
462 
463  // Initialize the output values if they weren't initialized by the ReductionInitKernel
464  if (gridDim.x == 1) {
465  Index i = 2*thread_id;
466  for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
467  half* loc = output + i;
468  *((half2*)loc) = reducer.template initializePacket<half2>();
469  }
470  if (i < num_preserved_coeffs) {
471  output[i] = reducer.initialize();
472  }
473  __syncthreads();
474  }
475 
476  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
477  const Index row = 2 * (i / input_col_blocks);
478 
479  if (row + 1 < num_preserved_coeffs) {
480  const Index col_block = i % input_col_blocks;
481  const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
482 
483  half2 reduced_val1 = reducer.template initializePacket<half2>();
484  half2 reduced_val2 = reducer.template initializePacket<half2>();
485 
486  for (Index j = 0; j < NumPerThread; j += unroll_times) {
487  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
488  if (last_col >= num_coeffs_to_reduce) {
489  Index col = col_begin + blockDim.x * j;
490  for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
491  const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
492  reducer.reducePacket(val1, &reduced_val1);
493  const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
494  reducer.reducePacket(val2, &reduced_val2);
495  }
496  if (col < num_coeffs_to_reduce) {
497  // Peel off the last, odd column.
498  const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
499  const half2 val1 = __halves2half2(last1, reducer.initialize());
500  reducer.reducePacket(val1, &reduced_val1);
501  const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
502  const half2 val2 = __halves2half2(last2, reducer.initialize());
503  reducer.reducePacket(val2, &reduced_val2);
504  }
505  break;
506  } else {
507  // Faster version of the loop with no branches after unrolling.
508 #pragma unroll
509  for (int k = 0; k < unroll_times; ++k) {
510  const Index col = col_begin + blockDim.x * (j + k) * 2;
511  reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
512  reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
513  }
514  }
515  }
516 
517 #pragma unroll
518  for (int offset = warpSize/2; offset > 0; offset /= 2) {
519  reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
520  reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
521  }
522 
523  half val1 = __low2half(reduced_val1);
524  reducer.reduce(__high2half(reduced_val1), &val1);
525  half val2 = __low2half(reduced_val2);
526  reducer.reduce(__high2half(reduced_val2), &val2);
527  half2 val = __halves2half2(val1, val2);
528 
529  if ((threadIdx.x & (warpSize - 1)) == 0) {
530  half* loc = output + row;
531  atomicReduce((half2*)loc, val, reducer);
532  }
533  }
534  }
535 }
536 
537 #endif
538 
539 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
540 struct InnerReductionLauncher {
541  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
542  assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
543  return true;
544  }
545 };
546 
547 // Specialization for float and double
548 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
549 struct InnerReductionLauncher<
550  Self, Op, OutputType, PacketAccess,
551  typename internal::enable_if<
552  internal::is_same<float, OutputType>::value ||
553  internal::is_same<double, OutputType>::value,
554  void>::type> {
555  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
556  typedef typename Self::Index Index;
557 
558  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
559  const int block_size = 256;
560  const int num_per_thread = 128;
561  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
562  const int max_blocks = device.getNumCudaMultiProcessors() *
563  device.maxCudaThreadsPerMultiProcessor() / block_size;
564  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
565 
566  if (num_blocks > 1) {
567  // We initialize the outputs outside the reduction kernel when we can't be sure that there
568  // won't be race conditions between multiple thread blocks.
569  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
570  const int max_blocks = device.getNumCudaMultiProcessors() *
571  device.maxCudaThreadsPerMultiProcessor() / 1024;
572  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
573  LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
574  num_blocks, 1024, 0, device, reducer.initialize(),
575  num_preserved_vals, output);
576  }
577 
578  LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
579  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
580 
581  return false;
582  }
583 };
584 
585 #ifdef EIGEN_HAS_CUDA_FP16
586 template <typename Self, typename Op>
587 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
588  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
589  assert(false && "Should not be called since there is no packet accessor");
590  return true;
591  }
592 };
593 
594 template <typename Self, typename Op>
595 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
596  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
597  typedef typename Self::Index Index;
598 
599  if (num_preserved_vals % 2 != 0) {
600  // Not supported yet, revert to the slower code path
601  return true;
602  }
603 
604  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
605  const int block_size = /*256*/128;
606  const int num_per_thread = /*128*/64;
607  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
608  const int max_blocks = device.getNumCudaMultiProcessors() *
609  device.maxCudaThreadsPerMultiProcessor() / block_size;
610  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
611 
612  if (num_blocks > 1) {
613  // We initialize the outputs outside the reduction kernel when we can't be sure that there
614  // won't be race conditions between multiple thread blocks.
615  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
616  const int max_blocks = device.getNumCudaMultiProcessors() *
617  device.maxCudaThreadsPerMultiProcessor() / 1024;
618  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
619  LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
620  1, 1, 0, device, reducer, self, num_preserved_vals, output);
621  }
622 
623  LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
624  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
625 
626  return false;
627  }
628 };
629 #endif
630 
631 
632 template <typename Self, typename Op>
633 struct InnerReducer<Self, Op, GpuDevice> {
634  // Unfortunately nvidia doesn't handle exotic types such as complex well,
635  // so we reduce the scope of the optimized version of the code to the simple case
636  // of floats and half floats.
637 #ifdef EIGEN_HAS_CUDA_FP16
638  static const bool HasOptimizedImplementation = !Op::IsStateful &&
639  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
640  internal::is_same<typename Self::CoeffReturnType, double>::value ||
641  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
642 #else
643  static const bool HasOptimizedImplementation = !Op::IsStateful &&
644  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
645  internal::is_same<typename Self::CoeffReturnType, double>::value);
646 #endif
647 
648  template <typename OutputType>
649  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
650  assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
651  const Index num_coeffs = array_prod(self.m_impl.dimensions());
652  // Don't crash when we're called with an input tensor of size 0.
653  if (num_coeffs == 0) {
654  return true;
655  }
656  // It's faster to use the usual code.
657  if (num_coeffs_to_reduce <= 128) {
658  return true;
659  }
660 
661  return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
662  }
663 };
664 
665 template <int NumPerThread, typename Self,
666  typename Reducer, typename Index>
667 __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
668  typename Self::CoeffReturnType* output) {
669  const Index num_threads = blockDim.x * gridDim.x;
670  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
671  // Initialize the output values if they weren't initialized by the ReductionInitKernel
672  if (gridDim.x == 1) {
673  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
674  output[i] = reducer.initialize();
675  }
676  __syncthreads();
677  }
678 
679  // Do the reduction.
680  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
681  for (Index i = thread_id; i < max_iter; i += num_threads) {
682  const Index input_col = i % num_preserved_coeffs;
683  const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
684  typename Self::CoeffReturnType reduced_val = reducer.initialize();
685  const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
686  for (Index j = input_row; j < max_row; j++) {
687  typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
688  reducer.reduce(val, &reduced_val);
689  }
690  atomicReduce(&(output[input_col]), reduced_val, reducer);
691  }
692 }
693 
694 
695 template <typename Self, typename Op>
696 struct OuterReducer<Self, Op, GpuDevice> {
697  // Unfortunately nvidia doesn't handle exotic types such as complex well,
698  // so we reduce the scope of the optimized version of the code to the simple case
699  // of floats.
700  static const bool HasOptimizedImplementation = !Op::IsStateful &&
701  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
702  internal::is_same<typename Self::CoeffReturnType, double>::value);
703  template <typename Device, typename OutputType>
704  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
705  assert(false && "Should only be called to reduce doubles or floats on a gpu device");
706  return true;
707  }
708 
709  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
710  typedef typename Self::Index Index;
711 
712  // It's faster to use the usual code.
713  if (num_coeffs_to_reduce <= 32) {
714  return true;
715  }
716 
717  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
718  const int block_size = 256;
719  const int num_per_thread = 16;
720  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
721  const int max_blocks = device.getNumCudaMultiProcessors() *
722  device.maxCudaThreadsPerMultiProcessor() / block_size;
723  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
724 
725  if (num_blocks > 1) {
726  // We initialize the outputs outside the reduction kernel when we can't be sure that there
727  // won't be race conditions between multiple thread blocks.
728  const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
729  const int max_blocks = device.getNumCudaMultiProcessors() *
730  device.maxCudaThreadsPerMultiProcessor() / 1024;
731  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
732  LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
733  num_blocks, 1024, 0, device, reducer.initialize(),
734  num_preserved_vals, output);
735  }
736 
737  LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
738  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
739 
740  return false;
741  }
742 };
743 
744 #endif
745 
746 
747 } // end namespace internal
748 } // end namespace Eigen
749 
750 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
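For context, the launchers in this file are what a device-assigned tensor reduction ultimately runs. A minimal usage sketch, compiled with nvcc and assuming pre-allocated device buffers d_in/d_out (the function name sumColumnsOnGpu and the 128x1024 shape are illustrative only):

    #define EIGEN_USE_GPU
    #include <unsupported/Eigen/CXX11/Tensor>

    void sumColumnsOnGpu(float* d_in, float* d_out) {
      Eigen::CudaStreamDevice stream;
      Eigen::GpuDevice gpu_device(&stream);

      // Map pre-allocated device memory as tensors.
      Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 128, 1024);
      Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 128);

      // Reduce over dimension 1. Depending on which dimensions are reduced, the
      // expression dispatches to the full/inner/outer reduction kernels above.
      Eigen::array<int, 1> reduce_dims;
      reduce_dims[0] = 1;
      out.device(gpu_device) = in.sum(reduce_dims);
      gpu_device.synchronize();
    }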