10 #ifndef EIGEN_PACKET_MATH_CUDA_H    11 #define EIGEN_PACKET_MATH_CUDA_H    20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)    21 template<> 
struct is_arithmetic<float4>  { 
enum { 
value = 
true }; };
    22 template<> 
struct is_arithmetic<double2> { 
enum { 
value = 
true }; };
    // NOTE(review): only the heads of the packet_traits<float> and
    // packet_traits<double> specializations survive in this excerpt. Both derive
    // from default_packet_traits; their bodies and closing braces are missing and
    // must be restored from the original header.
    24 template<> 
struct packet_traits<float> : default_packet_traits
    55 template<> 
struct packet_traits<double> : default_packet_traits
    85 template<> 
struct unpacket_traits<float4>  { 
typedef float  type; 
enum {size=4, alignment=
Aligned16}; 
typedef float4 half; };
    86 template<> 
struct unpacket_traits<double2> { 
typedef double type; 
enum {size=2, alignment=
Aligned16}; 
typedef double2 half; };
    // NOTE(review): the signature owning this statement is missing from this
    // excerpt. The statement builds a float4 with `from` in all four lanes;
    // presumably the float4 counterpart of pset1<double2> — confirm.
    89   return make_float4(from, from, from, from);
    91 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
    92   return make_double2(from, from);
    // NOTE(review): both statements below have lost their enclosing signatures in
    // this excerpt. Each builds a packet whose lanes are a, a+1, ... (four lanes
    // for float4, two for double2); restore the signatures from the original header.
    97   return make_float4(a, a+1, a+2, a+3);
   100   return make_double2(a, a+1);
   103 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a, 
const float4& b) {
   104   return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
   106 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a, 
const double2& b) {
   107   return make_double2(a.x+b.x, a.y+b.y);
   110 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a, 
const float4& b) {
   111   return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
   113 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a, 
const double2& b) {
   114   return make_double2(a.x-b.x, a.y-b.y);
   // NOTE(review): both statements below have lost their enclosing signatures in
   // this excerpt. Each returns a packet with every lane of `a` negated (float4
   // and double2 respectively); restore the signatures from the original header.
   118   return make_float4(-a.x, -a.y, -a.z, -a.w);
   121   return make_double2(-a.x, -a.y);
   127 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a, 
const float4& b) {
   128   return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
   130 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a, 
const double2& b) {
   131   return make_double2(a.x*b.x, a.y*b.y);
   134 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a, 
const float4& b) {
   135   return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
   137 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a, 
const double2& b) {
   138   return make_double2(a.x/b.x, a.y/b.y);
   141 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a, 
const float4& b) {
   142   return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
   144 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a, 
const double2& b) {
   145   return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
   148 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a, 
const float4& b) {
   149   return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
   151 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a, 
const double2& b) {
   152   return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
   // NOTE(review): the signature owning this statement is missing from this
   // excerpt. The statement reinterprets `from` as a pointer to const float4 and
   // returns the pointed-to value; presumably the float4 counterpart of
   // pload<double2> — confirm against the original header.
   156   return *
reinterpret_cast<const float4*
>(from);
   159 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
   160   return *
reinterpret_cast<const double2*
>(from);
   163 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
   164   return make_float4(from[0], from[1], from[2], from[3]);
   166 template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
   167   return make_double2(from[0], from[1]);
   // NOTE(review): both statements below have lost their enclosing signatures in
   // this excerpt. The first yields {from[0], from[0], from[1], from[1]}, the
   // second {from[0], from[0]}, i.e. each source element is duplicated into
   // adjacent lanes; restore the signatures from the original header.
   171   return make_float4(from[0], from[0], from[1], from[1]);
   174   return make_double2(from[0], from[0]);
   // NOTE(review): both statements below have lost their enclosing signatures in
   // this excerpt. Each reinterprets `to` as a pointer to the packet type (float4,
   // then double2) and assigns `from` through it as a single packet-sized store;
   // restore the signatures from the original header.
   178   *
reinterpret_cast<float4*
>(to) = from;
   182   *
reinterpret_cast<double2*
>(to) = from;
   198 EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
   199 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350   200   return __ldg((
const float4*)from);
   202   return make_float4(from[0], from[1], from[2], from[3]);
   206 EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
   207 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350   208   return __ldg((
const double2*)from);
   210   return make_double2(from[0], from[1]);
   215 EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
   216 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350   217   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
   219   return make_float4(from[0], from[1], from[2], from[3]);
   223 EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
   224 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350   225   return make_double2(__ldg(from+0), __ldg(from+1));
   227   return make_double2(from[0], from[1]);
   231 template<> EIGEN_DEVICE_FUNC 
inline float4 pgather<float, float4>(
const float* from, 
Index stride) {
   232   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
   235 template<> EIGEN_DEVICE_FUNC 
inline double2 pgather<double, double2>(
const double* from, 
Index stride) {
   236   return make_double2(from[0*stride], from[1*stride]);
   239 template<> EIGEN_DEVICE_FUNC 
inline void pscatter<float, float4>(
float* to, 
const float4& from, 
Index stride) {
   240   to[stride*0] = from.x;
   241   to[stride*1] = from.y;
   242   to[stride*2] = from.z;
   243   to[stride*3] = from.w;
   245 template<> EIGEN_DEVICE_FUNC 
inline void pscatter<double, double2>(
double* to, 
const double2& from, 
Index stride) {
   246   to[stride*0] = from.x;
   247   to[stride*1] = from.y;
   // NOTE(review): only the signatures of pfirst<float4> and pfirst<double2>
   // survive in this excerpt; each takes a packet by const reference and returns
   // one scalar. The return statements and closing braces are missing and must be
   // restored from the original header.
   250 template<> EIGEN_DEVICE_FUNC 
inline float  pfirst<float4>(
const float4& a) {
   253 template<> EIGEN_DEVICE_FUNC 
inline double pfirst<double2>(
const double2& a) {
   257 template<> EIGEN_DEVICE_FUNC 
inline float  predux<float4>(
const float4& a) {
   258   return a.x + a.y + a.z + a.w;
   // NOTE(review): only the signature of predux<double2> survives in this
   // excerpt; the return statement and closing brace are missing. Presumably it
   // mirrors predux<float4> over two lanes — confirm against the original header.
   260 template<> EIGEN_DEVICE_FUNC 
inline double predux<double2>(
const double2& a) {
   264 template<> EIGEN_DEVICE_FUNC 
inline float  predux_max<float4>(
const float4& a) {
   265   return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
   267 template<> EIGEN_DEVICE_FUNC 
inline double predux_max<double2>(
const double2& a) {
   268   return fmax(a.x, a.y);
   271 template<> EIGEN_DEVICE_FUNC 
inline float  predux_min<float4>(
const float4& a) {
   272   return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
   274 template<> EIGEN_DEVICE_FUNC 
inline double predux_min<double2>(
const double2& a) {
   275   return fmin(a.x, a.y);
   278 template<> EIGEN_DEVICE_FUNC 
inline float  predux_mul<float4>(
const float4& a) {
   279   return a.x * a.y * a.z * a.w;
   // NOTE(review): only the signature of predux_mul<double2> survives in this
   // excerpt; the return statement and closing brace are missing. Presumably it
   // mirrors predux_mul<float4> over two lanes — confirm against the original header.
   281 template<> EIGEN_DEVICE_FUNC 
inline double predux_mul<double2>(
const double2& a) {
   285 template<> EIGEN_DEVICE_FUNC 
inline float4  pabs<float4>(
const float4& a) {
   286   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
   288 template<> EIGEN_DEVICE_FUNC 
inline double2 pabs<double2>(
const double2& a) {
   289   return make_double2(fabs(a.x), fabs(a.y));
   // NOTE(review): the function name and parameter list are missing from this
   // excerpt; only the qualifiers and the body survive. The body swaps component j
   // of kernel.packet[i] with component i of kernel.packet[j] for every i < j over
   // four packets with x/y/z/w components, i.e. an in-place 4x4 transpose.
   // NOTE(review): `tmp` is declared double; if the packets are float4 (presumably,
   // given the .z/.w members) a float temporary would suffice — confirm.
   292 EIGEN_DEVICE_FUNC 
inline void   294   double tmp = kernel.packet[0].y;
   295   kernel.packet[0].y = kernel.packet[1].x;
   296   kernel.packet[1].x = tmp;
   // swap (row 0, z) with (row 2, x)
   298   tmp = kernel.packet[0].z;
   299   kernel.packet[0].z = kernel.packet[2].x;
   300   kernel.packet[2].x = tmp;
   // swap (row 0, w) with (row 3, x)
   302   tmp = kernel.packet[0].w;
   303   kernel.packet[0].w = kernel.packet[3].x;
   304   kernel.packet[3].x = tmp;
   // swap (row 1, z) with (row 2, y)
   306   tmp = kernel.packet[1].z;
   307   kernel.packet[1].z = kernel.packet[2].y;
   308   kernel.packet[2].y = tmp;
   // swap (row 1, w) with (row 3, y)
   310   tmp = kernel.packet[1].w;
   311   kernel.packet[1].w = kernel.packet[3].y;
   312   kernel.packet[3].y = tmp;
   // swap (row 2, w) with (row 3, z)
   314   tmp = kernel.packet[2].w;
   315   kernel.packet[2].w = kernel.packet[3].z;
   316   kernel.packet[3].z = tmp;
   // NOTE(review): the function name and parameter list are missing from this
   // excerpt; only the qualifiers and the body survive. The body swaps
   // kernel.packet[0].y with kernel.packet[1].x through a double temporary, i.e.
   // an in-place 2x2 transpose of two two-component packets.
   319 EIGEN_DEVICE_FUNC 
inline void   321   double tmp = kernel.packet[0].y;
   322   kernel.packet[0].y = kernel.packet[1].x;
   323   kernel.packet[1].x = tmp;
   333 #endif // EIGEN_PACKET_MATH_CUDA_H #define EIGEN_ALWAYS_INLINE
#define EIGEN_STRONG_INLINE
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API. 
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)