__kf_device__ float dot(const float3& v1, const float3& v2)
{
    return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z * v2.z));
}
__kf_device__ float3& operator+=(float3& vec, const float& v)
{
    vec.x += v; vec.y += v; vec.z += v;
    return vec;
}
__kf_device__ float3& operator+=(float3& v1, const float3& v2)
{
    v1.x += v2.x; v1.y += v2.y; v1.z += v2.z;
    return v1;
}
__kf_device__ float3 operator+(const float3& v1, const float3& v2)
{
    return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
}
// Component-wise product.
__kf_device__ float3 operator*(const float3& v1, const float3& v2)
{
    return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
}
// Component-wise quotient.
__kf_device__ float3 operator/(const float3& v1, const float3& v2)
{
    return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z);
}
__kf_device__ float3 operator/(const float& v, const float3& vec)
{
    return make_float3(v / vec.x, v / vec.y, v / vec.z);
}
__kf_device__ float3& operator*=(float3& vec, const float& v)
{
    vec.x *= v; vec.y *= v; vec.z *= v;
    return vec;
}
__kf_device__ float3 operator-(const float3& v1, const float3& v2)
{
    return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
}
__kf_device__ float3 operator*(const float3& v1, const float& v)
{
    return make_float3(v1.x * v, v1.y * v, v1.z * v);
}

__kf_device__ float3 operator*(const float& v, const float3& v1)
{
    return make_float3(v1.x * v, v1.y * v, v1.z * v);
}
__kf_device__ float norm(const float3& v)
{
    return sqrtf(dot(v, v));
}
__kf_device__ float3 normalized(const float3& v)
{
    return v * rsqrtf(dot(v, v));
}
__kf_hdevice__ float3 cross(const float3& v1, const float3& v2)
{
    return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
}
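// Usage sketch (hypothetical values, not part of this header): the helpers
// above compose in the usual way, e.g. a unit normal of a triangle (a, b, c):
//
//   float3 n = normalized(cross(b - a, c - a));
//   float len = norm(n);   // ~1.f for a non-degenerate triangle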
// Roots of x^2 - b*x + c = 0, written to roots.y and roots.z; roots.x is set
// to zero so the result can feed the cubic solver below.
__kf_device__ void computeRoots2(const float& b, const float& c, float3& roots)
{
    roots.x = 0.f;
    float d = b * b - 4.f * c;
    if (d < 0.f) // no real roots: should not happen for a symmetric matrix
        d = 0.f;

    float sd = sqrtf(d);
    roots.z = 0.5f * (b + sd);
    roots.y = 0.5f * (b - sd);
}
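// Sign-convention check (hypothetical numbers): for x^2 - 5x + 6 = 0, calling
// computeRoots2(5.f, 6.f, roots) yields roots.y == 2.f and roots.z == 3.f.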
// Roots of x^3 - c2*x^2 + c1*x - c0 = 0, the characteristic polynomial of a
// symmetric 3x3 matrix (so all three roots are real). Closed-form
// trigonometric solution; the roots are returned sorted in increasing order.
__kf_device__ void computeRoots3(float c0, float c1, float c2, float3& roots)
{
    if (fabsf(c0) < numeric_limits<float>::epsilon())
    {
        // One root is zero: solve the remaining quadratic.
        computeRoots2(c2, c1, roots);
    }
    else
    {
        const float s_inv3 = 1.f / 3.f;
        const float s_sqrt3 = sqrtf(3.f);

        // Parameters used to classify the roots and to solve in closed form.
        float c2_over_3 = c2 * s_inv3;
        float a_over_3 = (c1 - c2 * c2_over_3) * s_inv3;
        if (a_over_3 > 0.f) // round-off guard: a_over_3 must be <= 0 here
            a_over_3 = 0.f;

        float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

        float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
        if (q > 0.f) // same round-off guard; keeps sqrtf(-q) well defined
            q = 0.f;

        // Compute the roots from the trigonometric form.
        float rho = sqrtf(-a_over_3);
        float theta = atan2f(sqrtf(-q), half_b) * s_inv3;
        float cos_theta = __cosf(theta);
        float sin_theta = __sinf(theta);
        roots.x = c2_over_3 + 2.f * rho * cos_theta;
        roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
        roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

        // Sort in increasing order.
        if (roots.x >= roots.y)
            swap(roots.x, roots.y);

        if (roots.y >= roots.z)
        {
            swap(roots.y, roots.z);

            if (roots.x >= roots.y)
                swap(roots.x, roots.y);
        }

        if (roots.x <= 0.f) // a symmetric positive semi-definite matrix has no negative eigenvalues
            computeRoots2(c2, c1, roots);
    }
}
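// Sanity check (hypothetical numbers): for eigenvalues {1, 2, 3} the
// characteristic polynomial is x^3 - 6x^2 + 11x - 6, i.e. c2 = trace = 6,
// c1 = sum of 2x2 principal minors = 11, c0 = determinant = 6, so
// computeRoots3(6.f, 11.f, 6.f, roots) returns roots = (1, 2, 3).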
// Returns a unit vector orthogonal to src (a static member of Eigen33 in this
// header). Picks a direction that is guaranteed not to be colinear with src.
static __kf_device__ float3 unitOrthogonal(const float3& src)
{
    float3 perp;

    /* Unless the x and y coordinates are both close to zero, we can simply
     * take (-y, x, 0) and normalize it. */
    if (!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
    {
        float invnm = rsqrtf(src.x * src.x + src.y * src.y);
        perp.x = -src.y * invnm;
        perp.y =  src.x * invnm;
        perp.z = 0.f;
    }
    /* If both x and y are close to zero, src is close to the z-axis and far
     * from colinear with the x-axis, so take the cross product with (1,0,0)
     * and normalize it. */
    else
    {
        float invnm = rsqrtf(src.z * src.z + src.y * src.y);
        perp.x = 0.f;
        perp.y = -src.z * invnm;
        perp.z =  src.y * invnm;
    }

    return perp;
}
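// Note: for any non-zero src, dot(src, unitOrthogonal(src)) is ~0.f and
// norm(unitOrthogonal(src)) is ~1.f.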
// Eigen33::compute: eigenvalues and eigenvectors of the symmetric 3x3 matrix
// packed into mat_pkg (upper triangle, 6 floats). tmp and vec_tmp are scratch.
__kf_device__ void compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
{
    // Scale the matrix so its entries lie in [-1, 1]; this guards the
    // closed-form cubic below against overflow/underflow.
    float max01 = fmaxf(fabsf(mat_pkg[0]), fabsf(mat_pkg[1]));
    float max23 = fmaxf(fabsf(mat_pkg[2]), fabsf(mat_pkg[3]));
    float max45 = fmaxf(fabsf(mat_pkg[4]), fabsf(mat_pkg[5]));
    float m0123 = fmaxf(max01, max23);
    float scale = fmaxf(max45, m0123);

    if (scale <= numeric_limits<float>::min())
        scale = 1.f;

    mat_pkg[0] /= scale; mat_pkg[1] /= scale; mat_pkg[2] /= scale;
    mat_pkg[3] /= scale; mat_pkg[4] /= scale; mat_pkg[5] /= scale;
    // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0, where
    // c0 = determinant, c1 = sum of 2x2 principal minors, c2 = trace.
    float c0 = m00() * m11() * m22()
             + 2.f * m01() * m02() * m12()
             - m00() * m12() * m12()
             - m11() * m02() * m02()
             - m22() * m01() * m01();
    float c1 = m00() * m11() - m01() * m01()
             + m00() * m22() - m02() * m02()
             + m11() * m22() - m12() * m12();
    float c2 = m00() + m11() + m22();

    computeRoots3(c0, c1, c2, evals);
    if (evals.z - evals.x <= numeric_limits<float>::epsilon())
    {
        // All three eigenvalues are equal: any orthonormal basis works.
        evecs[0] = make_float3(1.f, 0.f, 0.f);
        evecs[1] = make_float3(0.f, 1.f, 0.f);
        evecs[2] = make_float3(0.f, 0.f, 1.f);
    }
    else if (evals.y - evals.x <= numeric_limits<float>::epsilon())
    {
        // First and second eigenvalues are equal: compute the eigenvector of
        // the distinct eigenvalue evals.z, then complete an orthonormal basis.
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
            evecs[2] = vec_tmp[0] * rsqrtf(len1);
        else if (len2 >= len1 && len2 >= len3)
            evecs[2] = vec_tmp[1] * rsqrtf(len2);
        else
            evecs[2] = vec_tmp[2] * rsqrtf(len3);

        evecs[1] = unitOrthogonal(evecs[2]);
        evecs[0] = cross(evecs[1], evecs[2]);
    }
    else if (evals.z - evals.y <= numeric_limits<float>::epsilon())
    {
        // Second and third eigenvalues are equal: compute the eigenvector of
        // the distinct eigenvalue evals.x, then complete an orthonormal basis.
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);

        if (len1 >= len2 && len1 >= len3)
            evecs[0] = vec_tmp[0] * rsqrtf(len1);
        else if (len2 >= len1 && len2 >= len3)
            evecs[0] = vec_tmp[1] * rsqrtf(len2);
        else
            evecs[0] = vec_tmp[2] * rsqrtf(len3);

        evecs[1] = unitOrthogonal(evecs[0]);
        evecs[2] = cross(evecs[0], evecs[1]);
    }
    else
    {
        // All three eigenvalues are distinct. Each eigenvector is taken from
        // the null space of (A - eval*I); mmax tracks how reliable each
        // candidate is so the weakest ones can be re-orthogonalized below.
        float mmax[3];

        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        float len1 = dot(vec_tmp[0], vec_tmp[0]);
        float len2 = dot(vec_tmp[1], vec_tmp[1]);
        float len3 = dot(vec_tmp[2], vec_tmp[2]);
        unsigned int min_el = 2;
        unsigned int max_el = 2;

        if (len1 >= len2 && len1 >= len3)
        {
            mmax[2] = len1;
            evecs[2] = vec_tmp[0] * rsqrtf(len1);
        }
        else if (len2 >= len1 && len2 >= len3)
        {
            mmax[2] = len2;
            evecs[2] = vec_tmp[1] * rsqrtf(len2);
        }
        else
        {
            mmax[2] = len3;
            evecs[2] = vec_tmp[2] * rsqrtf(len3);
        }
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot(vec_tmp[0], vec_tmp[0]);
        len2 = dot(vec_tmp[1], vec_tmp[1]);
        len3 = dot(vec_tmp[2], vec_tmp[2]);
        if (len1 >= len2 && len1 >= len3)
        {
            mmax[1] = len1;
            evecs[1] = vec_tmp[0] * rsqrtf(len1);
            min_el = len1 <= mmax[min_el] ? 1 : min_el;
            max_el = len1 > mmax[max_el] ? 1 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
            mmax[1] = len2;
            evecs[1] = vec_tmp[1] * rsqrtf(len2);
            min_el = len2 <= mmax[min_el] ? 1 : min_el;
            max_el = len2 > mmax[max_el] ? 1 : max_el;
        }
        else
        {
            mmax[1] = len3;
            evecs[1] = vec_tmp[2] * rsqrtf(len3);
            min_el = len3 <= mmax[min_el] ? 1 : min_el;
            max_el = len3 > mmax[max_el] ? 1 : max_el;
        }
        tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
        tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;

        vec_tmp[0] = cross(tmp[0], tmp[1]);
        vec_tmp[1] = cross(tmp[0], tmp[2]);
        vec_tmp[2] = cross(tmp[1], tmp[2]);

        len1 = dot(vec_tmp[0], vec_tmp[0]);
        len2 = dot(vec_tmp[1], vec_tmp[1]);
        len3 = dot(vec_tmp[2], vec_tmp[2]);
        if (len1 >= len2 && len1 >= len3)
        {
            mmax[0] = len1;
            evecs[0] = vec_tmp[0] * rsqrtf(len1);
            min_el = len1 <= mmax[min_el] ? 0 : min_el;
            max_el = len1 > mmax[max_el] ? 0 : max_el;
        }
        else if (len2 >= len1 && len2 >= len3)
        {
            mmax[0] = len2;
            evecs[0] = vec_tmp[1] * rsqrtf(len2);
            min_el = len2 <= mmax[min_el] ? 0 : min_el;
            max_el = len2 > mmax[max_el] ? 0 : max_el;
        }
        else
        {
            mmax[0] = len3;
            evecs[0] = vec_tmp[2] * rsqrtf(len3);
            min_el = len3 <= mmax[min_el] ? 0 : min_el;
            max_el = len3 > mmax[max_el] ? 0 : max_el;
        }
        // Rebuild the two least reliable eigenvectors from cross products of
        // the others, so the basis stays orthonormal.
        unsigned mid_el = 3 - min_el - max_el;
        evecs[min_el] = normalized(cross(evecs[(min_el + 1) % 3], evecs[(min_el + 2) % 3]));
        evecs[mid_el] = normalized(cross(evecs[(mid_el + 1) % 3], evecs[(mid_el + 2) % 3]));
    }

    // Rescale the eigenvalues back to the original size of the matrix.
    evals *= scale;
}
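// Usage sketch (hypothetical kernel; names and the Eigen33::Mat33 typedef are
// assumptions for illustration): pack the upper triangle of a symmetric
// covariance matrix into 6 floats and run the solver; evecs[0] is then the
// eigenvector of the smallest eigenvalue, e.g. a surface-normal estimate:
//
//   __global__ void normals_kernel(/* ... */)
//   {
//       float cov[6];   // {m00, m01, m02, m11, m12, m22}
//       // ... accumulate the covariance of neighboring points into cov ...
//       Eigen33 eigen33(cov);
//       Eigen33::Mat33 tmp, vec_tmp, evecs;
//       float3 evals;
//       eigen33.compute(tmp, vec_tmp, evecs, evals);
//       float3 normal = evecs[0];   // smallest-eigenvalue direction
//   }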
// Adapted from Eigen's NumTraits: true iff |x| is negligible compared to |y|
// at single-precision accuracy (a static member of Eigen33 in this header).
static __kf_device__ bool isMuchSmallerThan(float x, float y)
{
    float prec_sqr = numeric_limits<float>::epsilon() * numeric_limits<float>::epsilon();
    return x * x <= prec_sqr * y * y;
}
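// Example (hypothetical numbers): with float epsilon ~1.19e-7f, prec_sqr is
// ~1.42e-14, so isMuchSmallerThan(1e-8f, 1.f) is true (1e-16 <= 1.42e-14)
// while isMuchSmallerThan(1e-3f, 1.f) is false.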
struct Warp
{
    enum
    {
        LOG_WARP_SIZE = 5,
        WARP_SIZE = 1 << LOG_WARP_SIZE,
        STRIDE = WARP_SIZE
    };
    /** \brief Returns the warp lane ID of the calling thread. */
    static __kf_device__ unsigned int laneId()
    {
        unsigned int ret;
        asm("mov.u32 %0, %laneid;" : "=r"(ret));
        return ret;
    }
    static __kf_device__ unsigned int id()
    {
        int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        return tid >> LOG_WARP_SIZE;
    }
    // Mask with one bit set for every lane below the calling lane.
    static __kf_device__ int laneMaskLt()
    {
#if (__CUDA_ARCH__ >= 200)
        unsigned int ret;
        asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
        return ret;
#else
        // Written as ~(~0u << lane) to avoid the undefined 32-bit shift that
        // (0xFFFFFFFF >> (32 - lane)) would perform at lane 0.
        return ~(0xFFFFFFFFu << laneId());
#endif
    }

    static __kf_device__ int binaryExclScan(int ballot_mask)
    {
        // Count the lower lanes whose ballot bit is set.
        return __popc(Warp::laneMaskLt() & ballot_mask);
    }
};
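// Usage sketch (hypothetical, assuming a warp-wide vote is available):
// combining a ballot mask with laneMaskLt() gives an exclusive scan of a
// predicate, so each voting lane gets a unique, densely packed output slot:
//
//   int mask   = Emulation::Ballot(valid, cta_buffer); // bit i set if lane i voted
//   int offset = Warp::binaryExclScan(mask);           // #valid lanes below me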
struct Block
{
    static __kf_device__ unsigned int stride()
    {
        return blockDim.x * blockDim.y * blockDim.z;
    }
    static __kf_device__ int flattenedThreadId()
    {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    }
    // In-place tree reduction over a shared-memory buffer of CTA_SIZE elements.
    template<int CTA_SIZE, typename T, class BinOp>
    static __kf_device__ void reduce(volatile T* buffer, BinOp op)
    {
        int tid = flattenedThreadId();
        T val = buffer[tid];

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        // Warp-synchronous tail: only the first warp participates, so no
        // __syncthreads() is needed between steps.
        if (tid < 32)
        {
            if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
            if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
            if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
            if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
            if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
            if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
    }
    // Same reduction, but seeds the buffer from init and returns the result.
    template<int CTA_SIZE, typename T, class BinOp>
    static __kf_device__ T reduce(volatile T* buffer, T init, BinOp op)
    {
        int tid = flattenedThreadId();
        T val = buffer[tid] = init;
        __syncthreads();

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0 // shuffle path disabled via "&& 0"
            if (CTA_SIZE >= 64) val = op(val, buffer[tid + 32]);
            if (CTA_SIZE >= 32) val = op(val, __shfl_xor(val, 16));
            if (CTA_SIZE >= 16) val = op(val, __shfl_xor(val,  8));
            if (CTA_SIZE >=  8) val = op(val, __shfl_xor(val,  4));
            if (CTA_SIZE >=  4) val = op(val, __shfl_xor(val,  2));
            if (CTA_SIZE >=  2) buffer[tid] = op(val, __shfl_xor(val,  1));
#else
            if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
            if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
            if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
            if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
            if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
            if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
#endif
        }
        __syncthreads();
        return buffer[0];
    }
};
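// Usage sketch (hypothetical kernel body; assumes a device-callable binary
// functor such as thrust::plus<int> from <thrust/functional.h>): summing one
// int per thread across a 256-thread block with the init-returning overload:
//
//   __shared__ volatile int smem[256];
//   int my_count = /* per-thread value */;
//   int total = Block::reduce<256>(smem, my_count, thrust::plus<int>());
//   // every thread now holds the block-wide sum in `total`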
struct Emulation
{
    // Shared-memory warp reduction for devices without vote/shuffle support.
    static __kf_device__ int warp_reduce(volatile int* ptr, const unsigned int tid)
    {
        const unsigned int lane = tid & 31; // index of the thread within its warp

        if (lane < 16)
        {
            int partial = ptr[tid];

            ptr[tid] = partial = partial + ptr[tid + 16];
            ptr[tid] = partial = partial + ptr[tid +  8];
            ptr[tid] = partial = partial + ptr[tid +  4];
            ptr[tid] = partial = partial + ptr[tid +  2];
            ptr[tid] = partial = partial + ptr[tid +  1];
        }

        return ptr[tid - lane]; // lane 0's slot holds the warp total
    }
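    // Trace (hypothetical input): if all 32 lanes hold 1, the five halving
    // steps leave 32 in the warp's lane-0 slot, so every lane reads back the
    // warp total from ptr[tid - lane].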
    static __kf_device__ int Ballot(int predicate, volatile int* cta_buffer)
    {
#if __CUDA_ARCH__ >= 200
        (void)cta_buffer;
        return __ballot(predicate);
#else
        int tid = Block::flattenedThreadId();
        cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
        return warp_reduce(cta_buffer, tid);
#endif
    }
    static __kf_device__ bool All(int predicate, volatile int* cta_buffer)
    {
#if __CUDA_ARCH__ >= 200
        (void)cta_buffer;
        return __all(predicate);
#else
        int tid = Block::flattenedThreadId();
        cta_buffer[tid] = predicate ? 1 : 0;
        return warp_reduce(cta_buffer, tid) == 32;
#endif
    }
};
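// Design note: on sm_20+ these forward to the hardware vote intrinsics
// (__ballot / __all). The shared-memory fallback in Ballot adds per-lane
// one-hot bits, which equals OR-ing them since the bits are disjoint, and
// All sums 0/1 votes and compares the result against the warp size of 32.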
// ----------------------------------------------------------------------------
// Remaining declarations (bodies not shown in this excerpt), grouped by owner.

// Generic helpers.
template<typename T> __kf_hdevice__ void swap(T& a, T& b);
__kf_device__ float norm_sqr(const float3& v);
__kf_device__ Vec3f operator*(const Mat3f& m, const Vec3f& v);

// numeric_limits<T>-style traits used throughout this header.
static __kf_device__ float epsilon();
static __kf_device__ float min();
static __kf_device__ float max();
static __kf_device__ float quiet_NaN();
static __kf_device__ unsigned short max(); // unsigned short specialization

// Eigen33: constructor, element access over the packed upper triangle, rows.
__kf_device__ Eigen33(volatile float* mat_pkg_arg);
__kf_hdevice__ float3& operator[](int i);
__kf_hdevice__ const float3& operator[](int i) const;
__kf_device__ float m00() const;
__kf_device__ float m01() const;
__kf_device__ float m02() const;
__kf_device__ float m10() const;
__kf_device__ float m11() const;
__kf_device__ float m12() const;
__kf_device__ float m20() const;
__kf_device__ float m21() const;
__kf_device__ float m22() const;
__kf_device__ float3 row0() const;
__kf_device__ float3 row1() const;
__kf_device__ float3 row2() const;