temp_utils.hpp
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cuda.h>
5 
6 namespace kfusion
7 {
8  namespace device
9  {
10  template <class T> __kf_hdevice__ void swap(T& a, T& b) { T c(a); a=b; b=c; }
11 
12  template<typename T> struct numeric_limits;
13 
14  template<> struct numeric_limits<float>
15  {
16  __kf_device__ static float quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ };
17  __kf_device__ static float epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
18  __kf_device__ static float min() { return 1.175494351e-38f/*FLT_MIN*/; };
19  __kf_device__ static float max() { return 3.402823466e+38f/*FLT_MAX*/; };
20  };
21 
22  template<> struct numeric_limits<unsigned short>
23  {
24  __kf_device__ static unsigned short max() { return USHRT_MAX; };
25  };
26 
27  __kf_device__ float dot(const float3& v1, const float3& v2)
28  {
29  return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z*v2.z));
30  }
31 
32  __kf_device__ float3& operator+=(float3& vec, const float& v)
33  {
34  vec.x += v; vec.y += v; vec.z += v; return vec;
35  }
36 
37  __kf_device__ float3& operator+=(float3& v1, const float3& v2)
38  {
39  v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; return v1;
40  }
41 
42  __kf_device__ float3 operator+(const float3& v1, const float3& v2)
43  {
44  return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
45  }
46 
47  __kf_device__ float3 operator*(const float3& v1, const float3& v2)
48  {
49  return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
50  }
51 
52  __kf_hdevice__ float3 operator*(const float3& v1, const int3& v2)
53  {
54  return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
55  }
56 
57  __kf_device__ float3 operator/(const float3& v1, const float3& v2)
58  {
59  return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z);
60  }
61 
62  __kf_hdevice__ float3 operator/(const float& v, const float3& vec)
63  {
64  return make_float3(v / vec.x, v / vec.y, v / vec.z);
65  }
66 
67  __kf_device__ float3& operator*=(float3& vec, const float& v)
68  {
69  vec.x *= v; vec.y *= v; vec.z *= v; return vec;
70  }
71 
72  __kf_device__ float3 operator-(const float3& v1, const float3& v2)
73  {
74  return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
75  }
76 
77  __kf_hdevice__ float3 operator*(const float3& v1, const float& v)
78  {
79  return make_float3(v1.x * v, v1.y * v, v1.z * v);
80  }
81 
82  __kf_hdevice__ float3 operator*(const float& v, const float3& v1)
83  {
84  return make_float3(v1.x * v, v1.y * v, v1.z * v);
85  }
86 
87  __kf_device__ float norm(const float3& v)
88  {
89  return sqrt(dot(v, v));
90  }
91 
        // Squared Euclidean length (dot of v with itself); avoids the square
        // root when only relative magnitudes are needed.
        __kf_device__ float norm_sqr(const float3& v)
        {
            return dot(v, v);
        }
96 
97  __kf_device__ float3 normalized(const float3& v)
98  {
99  return v * rsqrt(dot(v, v));
100  }
101 
102  __kf_hdevice__ float3 cross(const float3& v1, const float3& v2)
103  {
104  return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
105  }
106 
107  __kf_device__ void computeRoots2(const float& b, const float& c, float3& roots)
108  {
109  roots.x = 0.f;
110  float d = b * b - 4.f * c;
111  if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
112  d = 0.f;
113 
114  float sd = sqrtf(d);
115 
116  roots.z = 0.5f * (b + sd);
117  roots.y = 0.5f * (b - sd);
118  }
119 
120  __kf_device__ void computeRoots3(float c0, float c1, float c2, float3& roots)
121  {
122  if ( fabsf(c0) < numeric_limits<float>::epsilon())// one root is 0 -> quadratic equation
123  {
124  computeRoots2 (c2, c1, roots);
125  }
126  else
127  {
128  const float s_inv3 = 1.f/3.f;
129  const float s_sqrt3 = sqrtf(3.f);
130  // Construct the parameters used in classifying the roots of the equation
131  // and in solving the equation for the roots in closed form.
132  float c2_over_3 = c2 * s_inv3;
133  float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
134  if (a_over_3 > 0.f)
135  a_over_3 = 0.f;
136 
137  float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));
138 
139  float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
140  if (q > 0.f)
141  q = 0.f;
142 
143  // Compute the eigenvalues by solving for the roots of the polynomial.
144  float rho = sqrtf(-a_over_3);
145  float theta = atan2f (sqrtf (-q), half_b)*s_inv3;
146  float cos_theta = __cosf (theta);
147  float sin_theta = __sinf (theta);
148  roots.x = c2_over_3 + 2.f * rho * cos_theta;
149  roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
150  roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);
151 
152  // Sort in increasing order.
153  if (roots.x >= roots.y)
154  swap(roots.x, roots.y);
155 
156  if (roots.y >= roots.z)
157  {
158  swap(roots.y, roots.z);
159 
160  if (roots.x >= roots.y)
161  swap (roots.x, roots.y);
162  }
163  if (roots.x <= 0) // eigenval for symetric positive semi-definite matrix can not be negative! Set it to 0
164  computeRoots2 (c2, c1, roots);
165  }
166  }
167 
168  struct Eigen33
169  {
170  public:
171  template<int Rows> struct MiniMat
172  {
173  float3 data[Rows];
174  __kf_hdevice__ float3& operator[](int i) { return data[i]; }
175  __kf_hdevice__ const float3& operator[](int i) const { return data[i]; }
176  };
177  typedef MiniMat<3> Mat33;
178  typedef MiniMat<4> Mat43;
179 
180 
181  static __kf_device__ float3 unitOrthogonal (const float3& src)
182  {
183  float3 perp;
184  /* Let us compute the crossed product of *this with a vector
185  * that is not too close to being colinear to *this.
186  */
187 
188  /* unless the x and y coords are both close to zero, we can
189  * simply take ( -y, x, 0 ) and normalize it.
190  */
191  if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
192  {
193  float invnm = rsqrtf(src.x*src.x + src.y*src.y);
194  perp.x = -src.y * invnm;
195  perp.y = src.x * invnm;
196  perp.z = 0.0f;
197  }
198  /* if both x and y are close to zero, then the vector is close
199  * to the z-axis, so it's far from colinear to the x-axis for instance.
200  * So we take the crossed product with (1,0,0) and normalize it.
201  */
202  else
203  {
204  float invnm = rsqrtf(src.z * src.z + src.y * src.y);
205  perp.x = 0.0f;
206  perp.y = -src.z * invnm;
207  perp.z = src.y * invnm;
208  }
209 
210  return perp;
211  }
212 
213  __kf_device__ Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}
214  __kf_device__ void compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
215  {
216  // Scale the matrix so its entries are in [-1,1]. The scaling is applied
217  // only when at least one matrix entry has magnitude larger than 1.
218 
219  float max01 = fmaxf( fabsf(mat_pkg[0]), fabsf(mat_pkg[1]) );
220  float max23 = fmaxf( fabsf(mat_pkg[2]), fabsf(mat_pkg[3]) );
221  float max45 = fmaxf( fabsf(mat_pkg[4]), fabsf(mat_pkg[5]) );
222  float m0123 = fmaxf( max01, max23);
223  float scale = fmaxf( max45, m0123);
224 
225  if (scale <= numeric_limits<float>::min())
226  scale = 1.f;
227 
228  mat_pkg[0] /= scale;
229  mat_pkg[1] /= scale;
230  mat_pkg[2] /= scale;
231  mat_pkg[3] /= scale;
232  mat_pkg[4] /= scale;
233  mat_pkg[5] /= scale;
234 
235  // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
236  // eigenvalues are the roots to this equation, all guaranteed to be
237  // real-valued, because the matrix is symmetric.
238  float c0 = m00() * m11() * m22()
239  + 2.f * m01() * m02() * m12()
240  - m00() * m12() * m12()
241  - m11() * m02() * m02()
242  - m22() * m01() * m01();
243  float c1 = m00() * m11() -
244  m01() * m01() +
245  m00() * m22() -
246  m02() * m02() +
247  m11() * m22() -
248  m12() * m12();
249  float c2 = m00() + m11() + m22();
250 
251  computeRoots3(c0, c1, c2, evals);
252 
253  if(evals.z - evals.x <= numeric_limits<float>::epsilon())
254  {
255  evecs[0] = make_float3(1.f, 0.f, 0.f);
256  evecs[1] = make_float3(0.f, 1.f, 0.f);
257  evecs[2] = make_float3(0.f, 0.f, 1.f);
258  }
259  else if (evals.y - evals.x <= numeric_limits<float>::epsilon() )
260  {
261  // first and second equal
262  tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
263  tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
264 
265  vec_tmp[0] = cross(tmp[0], tmp[1]);
266  vec_tmp[1] = cross(tmp[0], tmp[2]);
267  vec_tmp[2] = cross(tmp[1], tmp[2]);
268 
269  float len1 = dot (vec_tmp[0], vec_tmp[0]);
270  float len2 = dot (vec_tmp[1], vec_tmp[1]);
271  float len3 = dot (vec_tmp[2], vec_tmp[2]);
272 
273  if (len1 >= len2 && len1 >= len3)
274  {
275  evecs[2] = vec_tmp[0] * rsqrtf (len1);
276  }
277  else if (len2 >= len1 && len2 >= len3)
278  {
279  evecs[2] = vec_tmp[1] * rsqrtf (len2);
280  }
281  else
282  {
283  evecs[2] = vec_tmp[2] * rsqrtf (len3);
284  }
285 
286  evecs[1] = unitOrthogonal(evecs[2]);
287  evecs[0] = cross(evecs[1], evecs[2]);
288  }
289  else if (evals.z - evals.y <= numeric_limits<float>::epsilon() )
290  {
291  // second and third equal
292  tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
293  tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
294 
295  vec_tmp[0] = cross(tmp[0], tmp[1]);
296  vec_tmp[1] = cross(tmp[0], tmp[2]);
297  vec_tmp[2] = cross(tmp[1], tmp[2]);
298 
299  float len1 = dot(vec_tmp[0], vec_tmp[0]);
300  float len2 = dot(vec_tmp[1], vec_tmp[1]);
301  float len3 = dot(vec_tmp[2], vec_tmp[2]);
302 
303  if (len1 >= len2 && len1 >= len3)
304  {
305  evecs[0] = vec_tmp[0] * rsqrtf(len1);
306  }
307  else if (len2 >= len1 && len2 >= len3)
308  {
309  evecs[0] = vec_tmp[1] * rsqrtf(len2);
310  }
311  else
312  {
313  evecs[0] = vec_tmp[2] * rsqrtf(len3);
314  }
315 
316  evecs[1] = unitOrthogonal( evecs[0] );
317  evecs[2] = cross(evecs[0], evecs[1]);
318  }
319  else
320  {
321 
322  tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
323  tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
324 
325  vec_tmp[0] = cross(tmp[0], tmp[1]);
326  vec_tmp[1] = cross(tmp[0], tmp[2]);
327  vec_tmp[2] = cross(tmp[1], tmp[2]);
328 
329  float len1 = dot(vec_tmp[0], vec_tmp[0]);
330  float len2 = dot(vec_tmp[1], vec_tmp[1]);
331  float len3 = dot(vec_tmp[2], vec_tmp[2]);
332 
333  float mmax[3];
334 
335  unsigned int min_el = 2;
336  unsigned int max_el = 2;
337  if (len1 >= len2 && len1 >= len3)
338  {
339  mmax[2] = len1;
340  evecs[2] = vec_tmp[0] * rsqrtf (len1);
341  }
342  else if (len2 >= len1 && len2 >= len3)
343  {
344  mmax[2] = len2;
345  evecs[2] = vec_tmp[1] * rsqrtf (len2);
346  }
347  else
348  {
349  mmax[2] = len3;
350  evecs[2] = vec_tmp[2] * rsqrtf (len3);
351  }
352 
353  tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
354  tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;
355 
356  vec_tmp[0] = cross(tmp[0], tmp[1]);
357  vec_tmp[1] = cross(tmp[0], tmp[2]);
358  vec_tmp[2] = cross(tmp[1], tmp[2]);
359 
360  len1 = dot(vec_tmp[0], vec_tmp[0]);
361  len2 = dot(vec_tmp[1], vec_tmp[1]);
362  len3 = dot(vec_tmp[2], vec_tmp[2]);
363 
364  if (len1 >= len2 && len1 >= len3)
365  {
366  mmax[1] = len1;
367  evecs[1] = vec_tmp[0] * rsqrtf (len1);
368  min_el = len1 <= mmax[min_el] ? 1 : min_el;
369  max_el = len1 > mmax[max_el] ? 1 : max_el;
370  }
371  else if (len2 >= len1 && len2 >= len3)
372  {
373  mmax[1] = len2;
374  evecs[1] = vec_tmp[1] * rsqrtf (len2);
375  min_el = len2 <= mmax[min_el] ? 1 : min_el;
376  max_el = len2 > mmax[max_el] ? 1 : max_el;
377  }
378  else
379  {
380  mmax[1] = len3;
381  evecs[1] = vec_tmp[2] * rsqrtf (len3);
382  min_el = len3 <= mmax[min_el] ? 1 : min_el;
383  max_el = len3 > mmax[max_el] ? 1 : max_el;
384  }
385 
386  tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
387  tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
388 
389  vec_tmp[0] = cross(tmp[0], tmp[1]);
390  vec_tmp[1] = cross(tmp[0], tmp[2]);
391  vec_tmp[2] = cross(tmp[1], tmp[2]);
392 
393  len1 = dot (vec_tmp[0], vec_tmp[0]);
394  len2 = dot (vec_tmp[1], vec_tmp[1]);
395  len3 = dot (vec_tmp[2], vec_tmp[2]);
396 
397 
398  if (len1 >= len2 && len1 >= len3)
399  {
400  mmax[0] = len1;
401  evecs[0] = vec_tmp[0] * rsqrtf (len1);
402  min_el = len3 <= mmax[min_el] ? 0 : min_el;
403  max_el = len3 > mmax[max_el] ? 0 : max_el;
404  }
405  else if (len2 >= len1 && len2 >= len3)
406  {
407  mmax[0] = len2;
408  evecs[0] = vec_tmp[1] * rsqrtf (len2);
409  min_el = len3 <= mmax[min_el] ? 0 : min_el;
410  max_el = len3 > mmax[max_el] ? 0 : max_el;
411  }
412  else
413  {
414  mmax[0] = len3;
415  evecs[0] = vec_tmp[2] * rsqrtf (len3);
416  min_el = len3 <= mmax[min_el] ? 0 : min_el;
417  max_el = len3 > mmax[max_el] ? 0 : max_el;
418  }
419 
420  unsigned mid_el = 3 - min_el - max_el;
421  evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
422  evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
423  }
424  // Rescale back to the original size.
425  evals *= scale;
426  }
427  private:
428  volatile float* mat_pkg;
429 
430  __kf_device__ float m00() const { return mat_pkg[0]; }
431  __kf_device__ float m01() const { return mat_pkg[1]; }
432  __kf_device__ float m02() const { return mat_pkg[2]; }
433  __kf_device__ float m10() const { return mat_pkg[1]; }
434  __kf_device__ float m11() const { return mat_pkg[3]; }
435  __kf_device__ float m12() const { return mat_pkg[4]; }
436  __kf_device__ float m20() const { return mat_pkg[2]; }
437  __kf_device__ float m21() const { return mat_pkg[4]; }
438  __kf_device__ float m22() const { return mat_pkg[5]; }
439 
440  __kf_device__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
441  __kf_device__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
442  __kf_device__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }
443 
444  __kf_device__ static bool isMuchSmallerThan (float x, float y)
445  {
446  // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
448  return x * x <= prec_sqr * y * y;
449  }
450  };
451 
452  struct Warp
453  {
454  enum
455  {
456  LOG_WARP_SIZE = 5,
457  WARP_SIZE = 1 << LOG_WARP_SIZE,
458  STRIDE = WARP_SIZE
459  };
460 
462  static __kf_device__ unsigned int laneId()
463  {
464  unsigned int ret;
465  asm("mov.u32 %0, %laneid;" : "=r"(ret) );
466  return ret;
467  }
468 
469  static __kf_device__ unsigned int id()
470  {
471  int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
472  return tid >> LOG_WARP_SIZE;
473  }
474 
476  {
477 #if (__CUDA_ARCH__ >= 200)
478  unsigned int ret;
479  asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
480  return ret;
481 #else
482  return 0xFFFFFFFF >> (32 - laneId());
483 #endif
484  }
485  static __kf_device__ int binaryExclScan(int ballot_mask)
486  {
487  return __popc(Warp::laneMaskLt() & ballot_mask);
488  }
489  };
490 
491  struct Block
492  {
493  static __kf_device__ unsigned int stride()
494  {
495  return blockDim.x * blockDim.y * blockDim.z;
496  }
497 
499  {
500  return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
501  }
502 
503  template<int CTA_SIZE, typename T, class BinOp>
504  static __kf_device__ void reduce(volatile T* buffer, BinOp op)
505  {
506  int tid = flattenedThreadId();
507  T val = buffer[tid];
508 
509  if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
510  if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
511  if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
512  if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
513 
514  if (tid < 32)
515  {
516  if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
517  if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
518  if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
519  if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
520  if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
521  if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
522  }
523  }
524 
525  template<int CTA_SIZE, typename T, class BinOp>
526  static __kf_device__ T reduce(volatile T* buffer, T init, BinOp op)
527  {
528  int tid = flattenedThreadId();
529  T val = buffer[tid] = init;
530  __syncthreads();
531 
532  if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
533  if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
534  if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
535  if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
536 
537  if (tid < 32)
538  {
539 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0
540  if (CTA_SIZE >= 64) val = op(val, buffer[tid + 32]);
541  if (CTA_SIZE >= 32) val = op(val, __shfl_xor(val, 16));
542  if (CTA_SIZE >= 16) val = op(val, __shfl_xor(val, 8));
543  if (CTA_SIZE >= 8) val = op(val, __shfl_xor(val, 4));
544  if (CTA_SIZE >= 4) val = op(val, __shfl_xor(val, 2));
545  if (CTA_SIZE >= 2) buffer[tid] = op(val, __shfl_xor(val, 1));
546 #else
547  if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
548  if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
549  if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
550  if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
551  if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
552  if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
553 #endif
554  }
555  __syncthreads();
556  return buffer[0];
557  }
558  };
559 
560 
561 
562  struct Emulation
563  {
564  static __kf_device__ int warp_reduce ( volatile int *ptr , const unsigned int tid)
565  {
566  const unsigned int lane = tid & 31; // index of thread in warp (0..31)
567 
568  if (lane < 16)
569  {
570  int partial = ptr[tid];
571 
572  ptr[tid] = partial = partial + ptr[tid + 16];
573  ptr[tid] = partial = partial + ptr[tid + 8];
574  ptr[tid] = partial = partial + ptr[tid + 4];
575  ptr[tid] = partial = partial + ptr[tid + 2];
576  ptr[tid] = partial = partial + ptr[tid + 1];
577  }
578  return ptr[tid - lane];
579  }
580 
581  static __kf_device__ int Ballot(int predicate, volatile int* cta_buffer)
582  {
583 #if __CUDA_ARCH__ >= 200
584  (void)cta_buffer;
585  return __ballot(predicate);
586 #else
587  int tid = Block::flattenedThreadId();
588  cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
589  return warp_reduce(cta_buffer, tid);
590 #endif
591  }
592 
593  static __kf_device__ bool All(int predicate, volatile int* cta_buffer)
594  {
595 #if __CUDA_ARCH__ >= 200
596  (void)cta_buffer;
597  return __all(predicate);
598 #else
599  int tid = Block::flattenedThreadId();
600  cta_buffer[tid] = predicate ? 1 : 0;
601  return warp_reduce(cta_buffer, tid) == 32;
602 #endif
603  }
604  };
605  }
606 }
__kf_hdevice__ float3 & operator[](int i)
Definition: temp_utils.hpp:174
static __kf_device__ unsigned int stride()
Definition: temp_utils.hpp:493
__kf_device__ float3 row0() const
Definition: temp_utils.hpp:440
__kf_device__ void computeRoots3(float c0, float c1, float c2, float3 &roots)
Definition: temp_utils.hpp:120
__kf_device__ float3 row1() const
Definition: temp_utils.hpp:441
static __kf_device__ int Ballot(int predicate, volatile int *cta_buffer)
Definition: temp_utils.hpp:581
static __kf_device__ int laneMaskLt()
Definition: temp_utils.hpp:475
__kf_device__ float norm_sqr(const float3 &v)
Definition: temp_utils.hpp:92
static __kf_device__ float epsilon()
Definition: temp_utils.hpp:17
static __kf_device__ unsigned short max()
Definition: temp_utils.hpp:24
static __kf_device__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition: temp_utils.hpp:462
__kf_device__ float m12() const
Definition: temp_utils.hpp:435
static __kf_device__ int warp_reduce(volatile int *ptr, const unsigned int tid)
Definition: temp_utils.hpp:564
static __kf_device__ unsigned int id()
Definition: temp_utils.hpp:469
__kf_hdevice__ void swap(T &a, T &b)
Definition: temp_utils.hpp:10
#define __kf_device__
__kf_device__ float m21() const
Definition: temp_utils.hpp:437
__kf_device__ float m10() const
Definition: temp_utils.hpp:433
static __kf_device__ float min()
Definition: temp_utils.hpp:18
static __kf_device__ void reduce(volatile T *buffer, BinOp op)
Definition: temp_utils.hpp:504
__kf_device__ float m22() const
Definition: temp_utils.hpp:438
__kf_device__ float3 operator+(const float3 &v1, const float3 &v2)
Definition: temp_utils.hpp:42
__kf_device__ float3 operator/(const float3 &v1, const float3 &v2)
Definition: temp_utils.hpp:57
static __kf_device__ int binaryExclScan(int ballot_mask)
Definition: temp_utils.hpp:485
static __kf_device__ bool isMuchSmallerThan(float x, float y)
Definition: temp_utils.hpp:444
__kf_device__ float dot(const float3 &v1, const float3 &v2)
Definition: temp_utils.hpp:27
#define __kf_hdevice__
__kf_device__ float m02() const
Definition: temp_utils.hpp:432
__kf_device__ float3 normalized(const float3 &v)
Definition: temp_utils.hpp:97
__kf_hdevice__ const float3 & operator[](int i) const
Definition: temp_utils.hpp:175
__kf_device__ float m11() const
Definition: temp_utils.hpp:434
__kf_device__ float m01() const
Definition: temp_utils.hpp:431
static __kf_device__ float max()
Definition: temp_utils.hpp:19
__kf_device__ float m20() const
Definition: temp_utils.hpp:436
__kf_device__ float3 & operator+=(float3 &vec, const float &v)
Definition: temp_utils.hpp:32
__kf_device__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals)
Definition: temp_utils.hpp:214
static __kf_device__ T reduce(volatile T *buffer, T init, BinOp op)
Definition: temp_utils.hpp:526
__kf_device__ float3 row2() const
Definition: temp_utils.hpp:442
static __kf_device__ bool All(int predicate, volatile int *cta_buffer)
Definition: temp_utils.hpp:593
__kf_device__ float3 & operator*=(float3 &vec, const float &v)
Definition: temp_utils.hpp:67
static __kf_device__ float quiet_NaN()
Definition: temp_utils.hpp:16
__kf_device__ Eigen33(volatile float *mat_pkg_arg)
Definition: temp_utils.hpp:213
__kf_device__ float norm(const float3 &v)
Definition: temp_utils.hpp:87
Utility.
Definition: capture.hpp:8
__kf_device__ float m00() const
Definition: temp_utils.hpp:430
__kf_device__ float3 operator-(const float3 &v1, const float3 &v2)
Definition: temp_utils.hpp:72
__kf_device__ Vec3f operator*(const Mat3f &m, const Vec3f &v)
Definition: device.hpp:74
volatile float * mat_pkg
Definition: temp_utils.hpp:428
__kf_device__ void computeRoots2(const float &b, const float &c, float3 &roots)
Definition: temp_utils.hpp:107
static __kf_device__ int flattenedThreadId()
Definition: temp_utils.hpp:498
__kf_hdevice__ float3 cross(const float3 &v1, const float3 &v2)
Definition: temp_utils.hpp:102
static __kf_device__ float3 unitOrthogonal(const float3 &src)
Definition: temp_utils.hpp:181


lvr2
Author(s): Thomas Wiemann , Sebastian Pütz , Alexander Mock , Lars Kiesow , Lukas Kalbertodt , Tristan Igelbrink , Johan M. von Behren , Dominik Feldschnieders , Alexander Löhr
autogenerated on Mon Feb 28 2022 22:46:09