10 #ifndef EIGEN_PACKET_MATH_CUDA_H 11 #define EIGEN_PACKET_MATH_CUDA_H 20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) 21 template<>
struct is_arithmetic<float4> {
enum {
value =
true }; };
22 template<>
struct is_arithmetic<double2> {
enum {
value =
true }; };
24 template<>
struct packet_traits<float> : default_packet_traits
55 template<>
struct packet_traits<double> : default_packet_traits
85 template<>
struct unpacket_traits<float4> {
typedef float type;
enum {
size=4, alignment=
Aligned16};
typedef float4 half; };
86 template<>
struct unpacket_traits<double2> {
typedef double type;
enum {
size=2, alignment=
Aligned16};
typedef double2 half; };
89 return make_float4(from, from, from, from);
91 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
92 return make_double2(from, from);
97 return make_float4(a, a+1, a+2, a+3);
100 return make_double2(a, a+1);
103 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a,
const float4&
b) {
104 return make_float4(a.x+
b.x, a.y+
b.y, a.z+
b.z, a.w+
b.w);
106 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a,
const double2&
b) {
107 return make_double2(a.x+
b.x, a.y+
b.y);
110 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a,
const float4&
b) {
111 return make_float4(a.x-
b.x, a.y-
b.y, a.z-
b.z, a.w-
b.w);
113 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a,
const double2&
b) {
114 return make_double2(a.x-
b.x, a.y-
b.y);
118 return make_float4(-a.x, -a.y, -a.z, -a.w);
121 return make_double2(-a.x, -a.y);
127 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a,
const float4&
b) {
128 return make_float4(a.x*
b.x, a.y*
b.y, a.z*
b.z, a.w*
b.w);
130 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a,
const double2&
b) {
131 return make_double2(a.x*
b.x, a.y*
b.y);
134 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a,
const float4&
b) {
135 return make_float4(a.x/
b.x, a.y/
b.y, a.z/
b.z, a.w/
b.w);
137 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a,
const double2&
b) {
138 return make_double2(a.x/
b.x, a.y/
b.y);
141 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a,
const float4&
b) {
142 return make_float4(fminf(a.x,
b.x), fminf(a.y,
b.y), fminf(a.z,
b.z), fminf(a.w,
b.w));
144 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a,
const double2&
b) {
145 return make_double2(
fmin(a.x,
b.x),
fmin(a.y,
b.y));
148 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a,
const float4&
b) {
149 return make_float4(fmaxf(a.x,
b.x), fmaxf(a.y,
b.y), fmaxf(a.z,
b.z), fmaxf(a.w,
b.w));
151 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a,
const double2&
b) {
152 return make_double2(
fmax(a.x,
b.x),
fmax(a.y,
b.y));
156 return *
reinterpret_cast<const float4*
>(from);
159 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
160 return *
reinterpret_cast<const double2*
>(from);
163 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
164 return make_float4(from[0], from[1], from[2], from[3]);
166 template<> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
167 return make_double2(from[0], from[1]);
171 return make_float4(from[0], from[0], from[1], from[1]);
174 return make_double2(from[0], from[0]);
178 *
reinterpret_cast<float4*
>(to) = from;
182 *
reinterpret_cast<double2*
>(to) = from;
198 EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
199 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 200 return __ldg((
const float4*)from);
202 return make_float4(from[0], from[1], from[2], from[3]);
206 EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
207 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 208 return __ldg((
const double2*)from);
210 return make_double2(from[0], from[1]);
215 EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
216 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 217 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
219 return make_float4(from[0], from[1], from[2], from[3]);
223 EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
224 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 225 return make_double2(__ldg(from+0), __ldg(from+1));
227 return make_double2(from[0], from[1]);
231 template<> EIGEN_DEVICE_FUNC
inline float4 pgather<float, float4>(
const float* from,
Index stride) {
232 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
235 template<> EIGEN_DEVICE_FUNC
inline double2 pgather<double, double2>(
const double* from,
Index stride) {
236 return make_double2(from[0*stride], from[1*stride]);
239 template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
240 to[stride*0] = from.x;
241 to[stride*1] = from.y;
242 to[stride*2] = from.z;
243 to[stride*3] = from.w;
245 template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, double2>(
double* to,
const double2& from,
Index stride) {
246 to[stride*0] = from.x;
247 to[stride*1] = from.y;
250 template<> EIGEN_DEVICE_FUNC
inline float pfirst<float4>(
const float4& a) {
253 template<> EIGEN_DEVICE_FUNC
inline double pfirst<double2>(
const double2& a) {
257 template<> EIGEN_DEVICE_FUNC
inline float predux<float4>(
const float4& a) {
258 return a.x + a.y + a.z + a.w;
260 template<> EIGEN_DEVICE_FUNC
inline double predux<double2>(
const double2& a) {
264 template<> EIGEN_DEVICE_FUNC
inline float predux_max<float4>(
const float4& a) {
265 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
267 template<> EIGEN_DEVICE_FUNC
inline double predux_max<double2>(
const double2& a) {
268 return fmax(a.x, a.y);
271 template<> EIGEN_DEVICE_FUNC
inline float predux_min<float4>(
const float4& a) {
272 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
274 template<> EIGEN_DEVICE_FUNC
inline double predux_min<double2>(
const double2& a) {
275 return fmin(a.x, a.y);
278 template<> EIGEN_DEVICE_FUNC
inline float predux_mul<float4>(
const float4& a) {
279 return a.x * a.y * a.z * a.w;
281 template<> EIGEN_DEVICE_FUNC
inline double predux_mul<double2>(
const double2& a) {
285 template<> EIGEN_DEVICE_FUNC
inline float4 pabs<float4>(
const float4& a) {
286 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
288 template<> EIGEN_DEVICE_FUNC
inline double2 pabs<double2>(
const double2& a) {
289 return make_double2(
fabs(a.x),
fabs(a.y));
292 EIGEN_DEVICE_FUNC
inline void 294 float tmp = kernel.packet[0].y;
295 kernel.packet[0].y = kernel.packet[1].x;
296 kernel.packet[1].x = tmp;
298 tmp = kernel.packet[0].z;
299 kernel.packet[0].z = kernel.packet[2].x;
300 kernel.packet[2].x = tmp;
302 tmp = kernel.packet[0].w;
303 kernel.packet[0].w = kernel.packet[3].x;
304 kernel.packet[3].x = tmp;
306 tmp = kernel.packet[1].z;
307 kernel.packet[1].z = kernel.packet[2].y;
308 kernel.packet[2].y = tmp;
310 tmp = kernel.packet[1].w;
311 kernel.packet[1].w = kernel.packet[3].y;
312 kernel.packet[3].y = tmp;
314 tmp = kernel.packet[2].w;
315 kernel.packet[2].w = kernel.packet[3].z;
316 kernel.packet[3].z = tmp;
319 EIGEN_DEVICE_FUNC
inline void 321 double tmp = kernel.packet[0].y;
322 kernel.packet[0].y = kernel.packet[1].x;
323 kernel.packet[1].x = tmp;
333 #endif // EIGEN_PACKET_MATH_CUDA_H #define EIGEN_ALWAYS_INLINE
#define EIGEN_STRONG_INLINE
const mpreal fmin(const mpreal &x, const mpreal &y, mp_rnd_t rnd_mode=mpreal::get_default_rnd())
const mpreal fabs(const mpreal &x, mp_rnd_t r=mpreal::get_default_rnd())
static constexpr size_t size(Tuple< Args... > &)
Provides access to the number of elements in a tuple as a compile-time constant expression.
EIGEN_STRONG_INLINE void pstoreu< double >(double *to, const Packet4d &from)
const mpreal fmax(const mpreal &x, const mpreal &y, mp_rnd_t rnd_mode=mpreal::get_default_rnd())
EIGEN_STRONG_INLINE void pstore< double >(double *to, const Packet4d &from)
EIGEN_STRONG_INLINE void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
EIGEN_STRONG_INLINE void pstore< float >(float *to, const Packet4f &from)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf &a)
EIGEN_STRONG_INLINE void pstoreu< float >(float *to, const Packet4f &from)
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf &a)
EIGEN_DEVICE_FUNC const Scalar & b