#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple cuda threads to safely accumulate at the same
// output address. It reads the current value of the output variable, then attempts to
// update it with the newly reduced value. If another cuda thread updated the content of
// the output address in the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
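
// Note: CUDA provides no atomicExch overload for double, so the specialization above round-trips
// the value through its 64-bit integer representation (__double_as_longlong / __longlong_as_double).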
#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif // EIGEN_HAS_CUDA_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  if (num_coeffs % 2 != 0) {
    half last = input.m_impl.coeff(num_coeffs-1);
    *scratch = __halves2half2(last, reducer.initialize());
  } else {
    *scratch = reducer.template initializePacket<half2>();
  }
}
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  const Index num_packets = num_coeffs / 2;
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    ((half2*)output)[i] = reducer.template initializePacket<half2>();
  }

  if (thread_id == 0 && num_coeffs % 2 != 0) {
    output[num_coeffs-1] = reducer.initialize();
  }
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                             half* output, half2* scratch) {
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;

  // Initialize the scratch value if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1 && first_index == 0) {
    if (num_coeffs % 2 != 0) {
      half last = input.m_impl.coeff(num_coeffs-1);
      *scratch = __halves2half2(last, reducer.initialize());
    } else {
      *scratch = reducer.template initializePacket<half2>();
    }
    __syncthreads();
  }

  half2 accum = reducer.template initializePacket<half2>();
  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + 2*i;
    half2 val = input.m_impl.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(scratch, accum, reducer);
  }

  __syncthreads();

  if (gridDim.x == 1 && first_index == 0) {
    half tmp = __low2half(*scratch);
    reducer.reduce(__high2half(*scratch), &tmp);
    *output = tmp;
  }
}
template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}
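
// Illustrative sketch (not part of Eigen): how the two 16-bit lanes of the half2 scratch value are
// collapsed into the final scalar by the kernels above, shown here for a plain sum.
#if 0
__device__ half collapseHalf2Sum(half2 packed) {
  // The low and high lanes hold independent partial sums; combine them into one half.
  return __hadd(__low2half(packed), __high2half(packed));
}
#endif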
#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    typedef typename Self::CoeffReturnType Scalar;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
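
// Worked example of the launch geometry above (illustrative, assuming divup(x, y) == (x + y - 1) / y):
// for num_coeffs == 1 << 20 floats, num_blocks = divup(1048576, 256 * 128) = 32, so 32 blocks of
// 256 threads each accumulate up to 128 coefficients per thread and combine their partial results
// through the semaphore-guarded atomicReduce in FullReductionKernel.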
#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be
      // sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the scope of
  // the optimized version of the code to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
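
// Illustrative usage sketch (not part of this header): how a full reduction reaches this code path
// from the public Tensor API. The function name full_sum_on_gpu is hypothetical, and allocation of
// the device buffers gpu_in / gpu_out is assumed to happen elsewhere.
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void full_sum_on_gpu(float* gpu_in, float* gpu_out, int rows, int cols) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(gpu_out);
  // Evaluating a rank-0 reduction on a GpuDevice dispatches to FullReducer above.
  out.device(device) = in.sum();
}
#endif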
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;

  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = 2*thread_id;
    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
      half* loc = output + i;
      *((half2*)loc) = reducer.template initializePacket<half2>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      half2 reduced_val1 = reducer.template initializePacket<half2>();
      half2 reduced_val2 = reducer.template initializePacket<half2>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            // Peel off the last (odd) coefficient of each row.
            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            const half2 val1 = __halves2half2(last1, reducer.initialize());
            reducer.reducePacket(val1, &reduced_val1);
            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
            const half2 val2 = __halves2half2(last2, reducer.initialize());
            reducer.reducePacket(val2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * 2;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
      }

      half val1 = __low2half(reduced_val1);
      reducer.reduce(__high2half(reduced_val1), &val1);
      half val2 = __low2half(reduced_val2);
      reducer.reduce(__high2half(reduced_val2), &val2);
      half2 val = __halves2half2(val1, val2);

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}

#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
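
// Worked example of the launch geometry above (illustrative): for a 2048 x 4096 float tensor reduced
// along its inner dimension, num_coeffs = 8388608 and dyn_blocks = divup(8388608, 256 * 128) = 256,
// then capped by the device-dependent max_blocks. Each surviving block walks input blocks in strides
// of gridDim.x and folds its warp-level partial results into output[row] through atomicReduce.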
#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the scope of
  // the optimized version of the code to the simple cases of floats, doubles and half floats.
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code when the reduced dimension is small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
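
// Illustrative usage sketch (not part of this header): reducing along the innermost (contiguous)
// dimension of a column-major tensor dispatches to InnerReducer above when the reduced dimension is
// larger than 128. The function name row_sums_on_gpu is hypothetical; the same includes and device
// buffer setup as in the earlier full-reduction sketch are assumed.
#if 0
void row_sums_on_gpu(float* gpu_in, float* gpu_out, int reduce_dim, int keep_dim) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, reduce_dim, keep_dim);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, keep_dim);
  Eigen::array<int, 1> dims = {{0}};  // dimension 0 is innermost for the default (column-major) layout
  out.device(device) = in.sum(dims);
}
#endif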
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the scope of
  // the optimized version of the code to the simple cases of floats and doubles.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code when the reduced dimension is small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
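
// Illustrative usage sketch (not part of this header): reducing along the outermost dimension of a
// column-major tensor dispatches to OuterReducer above when the reduced dimension exceeds 32. The
// function name column_sums_on_gpu is hypothetical; includes and buffer setup are assumed as before.
#if 0
void column_sums_on_gpu(float* gpu_in, float* gpu_out, int keep_dim, int reduce_dim) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in, keep_dim, reduce_dim);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(gpu_out, keep_dim);
  Eigen::array<int, 1> dims = {{1}};  // dimension 1 is outermost for the default (column-major) layout
  out.device(device) = in.sum(dims);
}
#endif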
#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes< Indices... > &)
constexpr int last(int, int result)
m m block(1, 0, 2, 2)<< 4
Rot2 R(Rot2::fromAngle(0.1))
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy y set format x g set format y g set format x2 g set format y2 g set format z g set angles radians set nogrid set key title set key left top Right noreverse box linetype linewidth samplen spacing width set nolabel set noarrow set nologscale set logscale x set set pointsize set encoding default set nopolar set noparametric set set set set surface set nocontour set clabel set mapping cartesian set nohidden3d set cntrparam order set cntrparam linear set cntrparam levels auto set cntrparam points set size set set xzeroaxis lt lw set x2zeroaxis lt lw set yzeroaxis lt lw set y2zeroaxis lt lw set tics in set ticslevel set tics set mxtics default set mytics default set mx2tics default set my2tics default set xtics border mirror norotate autofreq set ytics border mirror norotate autofreq set ztics border nomirror norotate autofreq set nox2tics set noy2tics set timestamp bottom norotate offset
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
void run(Expr &expr, Dev &dev)