29             return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z*v2.z));
 
   34             vec.x += v;  vec.y += v;  vec.z += v; 
return vec;
 
   39             v1.x += v2.x;  v1.y += v2.y;  v1.z += v2.z; 
return v1;
 
   44             return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
 
   49             return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
 
   54             return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
 
   59             return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z);
 
   64             return make_float3(v / vec.x, v / vec.y, v / vec.z);
 
   69             vec.x *= v;  vec.y *= v;  vec.z *= v; 
return vec;
 
   74             return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
 
   79             return make_float3(v1.x * v, v1.y * v, v1.z * v);
 
   84             return make_float3(v1.x * v, v1.y * v, v1.z * v);
 
   89             return sqrt(
dot(v, v));
 
   99             return v * rsqrt(
dot(v, v));
 
  104             return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
 
  110             float d = b * b - 4.f * c;
 
  116             roots.z = 0.5f * (b + sd);
 
  117             roots.y = 0.5f * (b - sd);
 
  128                 const float s_inv3 = 1.f/3.f;
 
  129                 const float s_sqrt3 = sqrtf(3.f);
 
  132                 float c2_over_3 = c2 * s_inv3;
 
  133                 float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
 
  137                 float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));
 
  139                 float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
 
  144                 float rho = sqrtf(-a_over_3);
 
  145                 float theta = atan2f (sqrtf (-q), half_b)*s_inv3;
 
  146                 float cos_theta = __cosf (theta);
 
  147                 float sin_theta = __sinf (theta);
 
  148                 roots.x = c2_over_3 + 2.f * rho * cos_theta;
 
  149                 roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
 
  150                 roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);
 
  153                 if (roots.x >= roots.y)
 
  154                     swap(roots.x, roots.y);
 
  156                 if (roots.y >= roots.z)
 
  158                     swap(roots.y, roots.z);
 
  160                     if (roots.x >= roots.y)
 
  161                         swap (roots.x, roots.y);
 
  193                     float invnm = rsqrtf(src.x*src.x + src.y*src.y);
 
  194                     perp.x = -src.y * invnm;
 
  195                     perp.y =  src.x * invnm;
 
  204                     float invnm = rsqrtf(src.z * src.z + src.y * src.y);
 
  206                     perp.y = -src.z * invnm;
 
  207                     perp.z =  src.y * invnm;
 
  222                 float m0123 = fmaxf( max01, max23);
 
  223                 float scale = fmaxf( max45, m0123);
 
  243                 float c1 = 
m00() * 
m11() -
 
  255                     evecs[0] = make_float3(1.f, 0.f, 0.f);
 
  256                     evecs[1] = make_float3(0.f, 1.f, 0.f);
 
  257                     evecs[2] = make_float3(0.f, 0.f, 1.f);
 
  263                     tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
 
  265                     vec_tmp[0] = 
cross(tmp[0], tmp[1]);
 
  266                     vec_tmp[1] = 
cross(tmp[0], tmp[2]);
 
  267                     vec_tmp[2] = 
cross(tmp[1], tmp[2]);
 
  269                     float len1 = 
dot (vec_tmp[0], vec_tmp[0]);
 
  270                     float len2 = 
dot (vec_tmp[1], vec_tmp[1]);
 
  271                     float len3 = 
dot (vec_tmp[2], vec_tmp[2]);
 
  273                     if (len1 >= len2 && len1 >= len3)
 
  275                         evecs[2] = vec_tmp[0] * rsqrtf (len1);
 
  277                     else if (len2 >= len1 && len2 >= len3)
 
  279                         evecs[2] = vec_tmp[1] * rsqrtf (len2);
 
  283                         evecs[2] = vec_tmp[2] * rsqrtf (len3);
 
  287                     evecs[0] = 
cross(evecs[1], evecs[2]);
 
  293                     tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
 
  295                     vec_tmp[0] = 
cross(tmp[0], tmp[1]);
 
  296                     vec_tmp[1] = 
cross(tmp[0], tmp[2]);
 
  297                     vec_tmp[2] = 
cross(tmp[1], tmp[2]);
 
  299                     float len1 = 
dot(vec_tmp[0], vec_tmp[0]);
 
  300                     float len2 = 
dot(vec_tmp[1], vec_tmp[1]);
 
  301                     float len3 = 
dot(vec_tmp[2], vec_tmp[2]);
 
  303                     if (len1 >= len2 && len1 >= len3)
 
  305                         evecs[0] = vec_tmp[0] * rsqrtf(len1);
 
  307                     else if (len2 >= len1 && len2 >= len3)
 
  309                         evecs[0] = vec_tmp[1] * rsqrtf(len2);
 
  313                         evecs[0] = vec_tmp[2] * rsqrtf(len3);
 
  317                     evecs[2] = 
cross(evecs[0], evecs[1]);
 
  323                     tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
 
  325                     vec_tmp[0] = 
cross(tmp[0], tmp[1]);
 
  326                     vec_tmp[1] = 
cross(tmp[0], tmp[2]);
 
  327                     vec_tmp[2] = 
cross(tmp[1], tmp[2]);
 
  329                     float len1 = 
dot(vec_tmp[0], vec_tmp[0]);
 
  330                     float len2 = 
dot(vec_tmp[1], vec_tmp[1]);
 
  331                     float len3 = 
dot(vec_tmp[2], vec_tmp[2]);
 
  335                     unsigned int min_el = 2;
 
  336                     unsigned int max_el = 2;
 
  337                     if (len1 >= len2 && len1 >= len3)
 
  340                         evecs[2] = vec_tmp[0] * rsqrtf (len1);
 
  342                     else if (len2 >= len1 && len2 >= len3)
 
  345                         evecs[2] = vec_tmp[1] * rsqrtf (len2);
 
  350                         evecs[2] = vec_tmp[2] * rsqrtf (len3);
 
  354                     tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;
 
  356                     vec_tmp[0] = 
cross(tmp[0], tmp[1]);
 
  357                     vec_tmp[1] = 
cross(tmp[0], tmp[2]);
 
  358                     vec_tmp[2] = 
cross(tmp[1], tmp[2]);
 
  360                     len1 = 
dot(vec_tmp[0], vec_tmp[0]);
 
  361                     len2 = 
dot(vec_tmp[1], vec_tmp[1]);
 
  362                     len3 = 
dot(vec_tmp[2], vec_tmp[2]);
 
  364                     if (len1 >= len2 && len1 >= len3)
 
  367                         evecs[1] = vec_tmp[0] * rsqrtf (len1);
 
  368                         min_el = len1 <= mmax[min_el] ? 1 : min_el;
 
  369                         max_el = len1  > mmax[max_el] ? 1 : max_el;
 
  371                     else if (len2 >= len1 && len2 >= len3)
 
  374                         evecs[1] = vec_tmp[1] * rsqrtf (len2);
 
  375                         min_el = len2 <= mmax[min_el] ? 1 : min_el;
 
  376                         max_el = len2  > mmax[max_el] ? 1 : max_el;
 
  381                         evecs[1] = vec_tmp[2] * rsqrtf (len3);
 
  382                         min_el = len3 <= mmax[min_el] ? 1 : min_el;
 
  383                         max_el = len3 >  mmax[max_el] ? 1 : max_el;
 
  387                     tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
 
  389                     vec_tmp[0] = 
cross(tmp[0], tmp[1]);
 
  390                     vec_tmp[1] = 
cross(tmp[0], tmp[2]);
 
  391                     vec_tmp[2] = 
cross(tmp[1], tmp[2]);
 
  393                     len1 = 
dot (vec_tmp[0], vec_tmp[0]);
 
  394                     len2 = 
dot (vec_tmp[1], vec_tmp[1]);
 
  395                     len3 = 
dot (vec_tmp[2], vec_tmp[2]);
 
  398                     if (len1 >= len2 && len1 >= len3)
 
  401                         evecs[0] = vec_tmp[0] * rsqrtf (len1);
 
  402                         min_el = len3 <= mmax[min_el] ? 0 : min_el;
 
  403                         max_el = len3  > mmax[max_el] ? 0 : max_el;
 
  405                     else if (len2 >= len1 && len2 >= len3)
 
  408                         evecs[0] = vec_tmp[1] * rsqrtf (len2);
 
  409                         min_el = len3 <= mmax[min_el] ? 0 : min_el;
 
  410                         max_el = len3  > mmax[max_el] ? 0 : max_el;
 
  415                         evecs[0] = vec_tmp[2] * rsqrtf (len3);
 
  416                         min_el = len3 <= mmax[min_el] ? 0 : min_el;
 
  417                         max_el = len3  > mmax[max_el] ? 0 : max_el;
 
  420                     unsigned mid_el = 3 - min_el - max_el;
 
  421                     evecs[min_el] = 
normalized( 
cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
 
  422                     evecs[mid_el] = 
normalized( 
cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
 
  448                 return x * x <= prec_sqr * y * y;
 
  465                 asm(
"mov.u32 %0, %laneid;" : 
"=r"(ret) );
 
  471                 int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
 
  477 #if (__CUDA_ARCH__ >= 200) 
  479                 asm(
"mov.u32 %0, %lanemask_lt;" : 
"=r"(ret) );
 
  482                 return 0xFFFFFFFF >> (32 - 
laneId());
 
  495                 return blockDim.x * blockDim.y * blockDim.z;
 
  500                 return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
 
  503             template<
int CTA_SIZE, 
typename T, 
class BinOp>
 
  509                 if (CTA_SIZE >= 1024) { 
if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
 
  510                 if (CTA_SIZE >=  512) { 
if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
 
  511                 if (CTA_SIZE >=  256) { 
if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
 
  512                 if (CTA_SIZE >=  128) { 
if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
 
  516                     if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
 
  517                     if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
 
  518                     if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
 
  519                     if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
 
  520                     if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
 
  521                     if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
 
  525             template<
int CTA_SIZE, 
typename T, 
class BinOp>
 
  529                 T val =  buffer[tid] = init;
 
  532                 if (CTA_SIZE >= 1024) { 
if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
 
  533                 if (CTA_SIZE >=  512) { 
if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
 
  534                 if (CTA_SIZE >=  256) { 
if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
 
  535                 if (CTA_SIZE >=  128) { 
if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
 
  539 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0 
  540                     if (CTA_SIZE >=   64) val = op(val, buffer[tid +  32]);
 
  541                     if (CTA_SIZE >=   32) val = op(val, __shfl_xor(val, 16));
 
  542                     if (CTA_SIZE >=   16) val = op(val, __shfl_xor(val,  8));
 
  543                     if (CTA_SIZE >=    8) val = op(val, __shfl_xor(val,  4));
 
  544                     if (CTA_SIZE >=    4) val = op(val, __shfl_xor(val,  2));
 
  545                     if (CTA_SIZE >=    2) buffer[tid] = op(val, __shfl_xor(val,  1));
 
  547                     if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
 
  548                     if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
 
  549                     if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
 
  550                     if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
 
  551                     if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
 
  552                     if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
 
  566                 const unsigned int lane = tid & 31; 
 
  570                     int partial = ptr[tid];
 
  572                     ptr[tid] = partial = partial + ptr[tid + 16];
 
  573                     ptr[tid] = partial = partial + ptr[tid + 8];
 
  574                     ptr[tid] = partial = partial + ptr[tid + 4];
 
  575                     ptr[tid] = partial = partial + ptr[tid + 2];
 
  576                     ptr[tid] = partial = partial + ptr[tid + 1];
 
  578                 return ptr[tid - lane];
 
  583 #if __CUDA_ARCH__ >= 200 
  585                 return __ballot(predicate);
 
  588                 cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
 
  595 #if __CUDA_ARCH__ >= 200 
  597                 return __all(predicate);
 
  600                 cta_buffer[tid] = predicate ? 1 : 0;