arch/NEON/GeneralBlockPanelKernel.h
Go to the documentation of this file.
1 namespace Eigen {
2 namespace internal {
3 
4 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
5 
6 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
7 // Here we specialize gebp_traits to eliminate these register spills.
8 // See #2138.
9 template<>
10 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
11  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
12 {
13  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
14  {
15  // This volatile inline ASM both acts as a barrier to prevent reordering,
16  // as well as enforces strict register use.
17  asm volatile(
18  "vmla.f32 %q[r], %q[c], %q[alpha]"
19  : [r] "+w" (r)
20  : [c] "w" (c),
21  [alpha] "w" (alpha)
22  : );
23  }
24 
25  template <typename LaneIdType>
26  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
27  Packet4f& c, Packet4f& tmp,
28  const LaneIdType&) const {
29  acc(a, b, c);
30  }
31 
32  template <typename LaneIdType>
33  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
34  Packet4f& c, Packet4f& tmp,
35  const LaneIdType& lane) const {
36  madd(a, b.get(lane), c, tmp, lane);
37  }
38 };
39 
40 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
41 
42 #if EIGEN_ARCH_ARM64
43 
44 template<>
45 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
46  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
47 {
48  typedef float RhsPacket;
49  typedef float32x4_t RhsPacketx4;
50 
51  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
52  {
53  dest = *b;
54  }
55 
56  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
57  {
58  dest = vld1q_f32(b);
59  }
60 
61  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
62  {
63  dest = *b;
64  }
65 
67  {}
68 
69  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
70  {
71  loadRhs(b,dest);
72  }
73 
74  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
75  {
76  c = vfmaq_n_f32(c, a, b);
77  }
78 
79  // NOTE: Template parameter inference failed when compiled with Android NDK:
80  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
81 
82  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
83  { madd_helper<0>(a, b, c); }
84  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
85  { madd_helper<1>(a, b, c); }
86  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
87  { madd_helper<2>(a, b, c); }
88  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
89  { madd_helper<3>(a, b, c); }
90 
91  private:
92  template<int LaneID>
93  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
94  {
95  #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
96  // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
97  // vfmaq_laneq_f32 is implemented through a costly dup
98  if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
99  else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
100  else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
101  else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
102  #else
103  c = vfmaq_laneq_f32(c, a, b, LaneID);
104  #endif
105  }
106 };
107 
108 
109 template<>
110 struct gebp_traits <double,double,false,false,Architecture::NEON>
111  : gebp_traits<double,double,false,false,Architecture::Generic>
112 {
113  typedef double RhsPacket;
114 
115  struct RhsPacketx4 {
116  float64x2_t B_0, B_1;
117  };
118 
119  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
120  {
121  dest = *b;
122  }
123 
124  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
125  {
126  dest.B_0 = vld1q_f64(b);
127  dest.B_1 = vld1q_f64(b+2);
128  }
129 
130  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
131  {
132  loadRhs(b,dest);
133  }
134 
136  {}
137 
138  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
139  {
140  loadRhs(b,dest);
141  }
142 
143  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
144  {
145  c = vfmaq_n_f64(c, a, b);
146  }
147 
148  // NOTE: Template parameter inference failed when compiled with Android NDK:
149  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
150 
151  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
152  { madd_helper<0>(a, b, c); }
153  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
154  { madd_helper<1>(a, b, c); }
155  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
156  { madd_helper<2>(a, b, c); }
157  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
158  { madd_helper<3>(a, b, c); }
159 
160  private:
161  template <int LaneID>
162  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
163  {
164  #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
165  // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
166  // vfmaq_laneq_f64 is implemented through a costly dup
167  if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
168  else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
169  else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
170  else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
171  #else
172  if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
173  else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
174  else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
175  else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
176  #endif
177  }
178 };
179 
180 #endif // EIGEN_ARCH_ARM64
181 
182 } // namespace internal
183 } // namespace Eigen
Eigen::Architecture::NEON
@ NEON
Definition: Constants.h:476
Eigen
Namespace containing all symbols from the Eigen library.
Definition: jet.h:637
Eigen::internal::Packet4f
__vector float Packet4f
Definition: AltiVec/PacketMath.h:30
alpha
RealScalar alpha
Definition: level1_cplx_impl.h:147
Eigen::internal::gebp_traits::loadRhs
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: products/GeneralBlockPanelKernel.h:474
c
Scalar Scalar * c
Definition: benchVecAdd.cpp:17
b
Scalar * b
Definition: benchVecAdd.cpp:17
Eigen::internal::gebp_traits::ResPacket
conditional< Vectorizable, _ResPacket, ResScalar >::type ResPacket
Definition: products/GeneralBlockPanelKernel.h:462
Eigen::internal::GEBPPacketFull
@ GEBPPacketFull
Definition: products/GeneralBlockPanelKernel.h:19
Eigen::internal::gebp_traits::RhsPacket
conditional< Vectorizable, _RhsPacket, RhsScalar >::type RhsPacket
Definition: products/GeneralBlockPanelKernel.h:461
Eigen::internal::gebp_traits::acc
EIGEN_STRONG_INLINE void acc(const AccPacket &c, const ResPacket &alpha, ResPacket &r) const
Definition: products/GeneralBlockPanelKernel.h:533
Eigen::internal::gebp_traits::loadRhsQuad
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar *b, RhsPacket &dest) const
Definition: products/GeneralBlockPanelKernel.h:494
Eigen::internal::gebp_traits::madd
EIGEN_STRONG_INLINE void madd(const LhsPacketType &a, const RhsPacketType &b, AccPacketType &c, RhsPacketType &tmp, const LaneIdType &) const
Definition: products/GeneralBlockPanelKernel.h:512
EIGEN_STRONG_INLINE
#define EIGEN_STRONG_INLINE
Definition: Macros.h:917
a
ArrayXXi a
Definition: Array_initializer_list_23_cxx11.cpp:1
Eigen::internal::gebp_traits::RhsScalar
_RhsScalar RhsScalar
Definition: products/GeneralBlockPanelKernel.h:422
Eigen::internal::gebp_traits::LhsPacket
conditional< Vectorizable, _LhsPacket, LhsScalar >::type LhsPacket
Definition: products/GeneralBlockPanelKernel.h:460
Eigen::internal::gebp_traits::AccPacket
ResPacket AccPacket
Definition: products/GeneralBlockPanelKernel.h:466
gtsam.examples.DogLegOptimizerExample.float
float
Definition: DogLegOptimizerExample.py:113
internal
Definition: BandTriangularSolver.h:13
Eigen::internal::gebp_traits::RhsPacketx4
QuadPacket< RhsPacket > RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:465
Eigen::internal::gebp_traits::updateRhs
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar *b, RhsPacketType &dest) const
Definition: products/GeneralBlockPanelKernel.h:485
Eigen::internal::QuadPacket::B_0
Packet B_0
Definition: products/GeneralBlockPanelKernel.h:363


gtsam
Author(s):
autogenerated on Fri Jan 10 2025 04:02:06