code_point.hpp
Go to the documentation of this file.
1 // Copyright (C) 2020-2024 Jonathan Müller and lexy contributors
2 // SPDX-License-Identifier: BSL-1.0
3 
4 #ifndef LEXY_CODE_POINT_HPP_INCLUDED
5 #define LEXY_CODE_POINT_HPP_INCLUDED
6 
7 #include <cstdint>
10 
// LEXY_UNICODE_CONSTEXPR expands to `constexpr` only when LEXY_HAS_UNICODE_DATABASE is non-zero;
// otherwise it expands to nothing.
11 #if LEXY_HAS_UNICODE_DATABASE
12 # define LEXY_UNICODE_CONSTEXPR constexpr
13 #else
14 # define LEXY_UNICODE_CONSTEXPR
15 #endif
16 
17 namespace lexy
18 {
21 {
22 public:
// Default constructor: stores 0xFFFF'FFFF, which is above the 0x10'FFFF limit that is_valid() tests against.
23  constexpr code_point() noexcept : _value(0xFFFF'FFFF) {}
// Wraps `value` as-is; no range check is performed here.
24  constexpr explicit code_point(char32_t value) noexcept : _value(value) {}
25 
// Returns the stored value unchanged.
26  constexpr auto value() const noexcept
27  {
28  return _value;
29  }
30 
31  //=== classification ===//
// True when the stored value is at most 0x7F.
32  constexpr bool is_ascii() const noexcept
33  {
34  return _value <= 0x7F;
35  }
// True when the stored value is at most 0xFFFF, i.e. it fits in 16 bits.
36  constexpr bool is_bmp() const noexcept
37  {
38  return _value <= 0xFFFF;
39  }
// True when the stored value is at most 0x10'FFFF.
// This check alone does not exclude the surrogate range; is_scalar() adds that.
40  constexpr bool is_valid() const noexcept
41  {
42  return _value <= 0x10'FFFF;
43  }
44 
// True when the stored value lies in [0x00, 0x1F] or in [0x7F, 0x9F].
45  constexpr bool is_control() const noexcept
46  {
47  return _value <= 0x1F || (0x7F <= _value && _value <= 0x9F);
48  }
// True when the stored value lies in [0xD800, 0xDFFF].
49  constexpr bool is_surrogate() const noexcept
50  {
51  return 0xD800 <= _value && _value <= 0xDFFF;
52  }
// True when the stored value lies in one of three ranges:
// [0xE000, 0xF8FF], [0x0F'0000, 0x0F'FFFD] or [0x10'0000, 0x10'FFFD].
// The last two code points of planes 15 and 16 (…FFFE, …FFFF) are deliberately outside these ranges.
53  constexpr bool is_private_use() const noexcept
54  {
55  return (0xE000 <= _value && _value <= 0xF8FF)
56  || (0x0F'0000 <= _value && _value <= 0x0F'FFFD)
57  || (0x10'0000 <= _value && _value <= 0x10'FFFD);
58  }
// True when the stored value is in [0xFDD0, 0xFDEF] or its low 16 bits are 0xFFFE or 0xFFFF.
// NOTE(review): there is no is_valid() check here, so values above 0x10'FFFF whose low 16 bits are
// 0xFFFE/0xFFFF (including the default-constructed 0xFFFF'FFFF) also yield true.
59  constexpr bool is_noncharacter() const noexcept
60  {
61  // Contiguous range of 32 non-characters.
62  if (0xFDD0 <= _value && _value <= 0xFDEF)
63  return true;
64 
65  // Last two code points of every plane.
// Masking with 0xFFFF keeps only the offset within the 65536-entry plane.
66  auto in_plane = _value & 0xFFFF;
67  return in_plane == 0xFFFE || in_plane == 0xFFFF;
68  }
69 
// True when the value passes is_valid() and is not in the surrogate range.
70  constexpr bool is_scalar() const noexcept
71  {
72  return is_valid() && !is_surrogate();
73  }
74 
75  //=== general category ===//
// Enumeration of general categories.
// Each LEXY_UNICODE_CATEGORY(Short, Long) line expands to `Short, Long = Short`, so every category
// is available under a two-letter name and a descriptive name with the same value.
76  enum general_category_t
77  {
78  // NOLINTNEXTLINE: can't use parentheses here
79 #define LEXY_UNICODE_CATEGORY(Short, Long) Short, Long = Short
80 
81  LEXY_UNICODE_CATEGORY(Lu, uppercase_letter),
82  LEXY_UNICODE_CATEGORY(Ll, lowercase_letter),
83  LEXY_UNICODE_CATEGORY(Lt, titlecase_letter),
84  LEXY_UNICODE_CATEGORY(Lm, modifier_letter),
85  LEXY_UNICODE_CATEGORY(Lo, other_letter),
86 
87  LEXY_UNICODE_CATEGORY(Mn, nonspacing_mark),
88  LEXY_UNICODE_CATEGORY(Mc, spacing_mark),
89  LEXY_UNICODE_CATEGORY(Me, enclosing_mark),
90 
91  LEXY_UNICODE_CATEGORY(Nd, decimal_number),
92  LEXY_UNICODE_CATEGORY(Nl, letter_number),
93  LEXY_UNICODE_CATEGORY(No, other_number),
94 
95  LEXY_UNICODE_CATEGORY(Pc, connector_punctuation),
96  LEXY_UNICODE_CATEGORY(Pd, dash_punctuation),
97  LEXY_UNICODE_CATEGORY(Ps, open_punctuation),
98  LEXY_UNICODE_CATEGORY(Pe, closing_punctuation),
// NOTE(review): the two long names below are spelled "puncutation"; they are public identifiers,
// so correcting the spelling would be an API change.
99  LEXY_UNICODE_CATEGORY(Pi, initial_puncutation),
100  LEXY_UNICODE_CATEGORY(Pf, final_puncutation),
101  LEXY_UNICODE_CATEGORY(Po, other_punctuation),
102 
103  LEXY_UNICODE_CATEGORY(Sm, math_symbol),
104  LEXY_UNICODE_CATEGORY(Sc, currency_symbol),
105  LEXY_UNICODE_CATEGORY(Sk, modifier_symbol),
106  LEXY_UNICODE_CATEGORY(So, other_symbol),
107 
108  LEXY_UNICODE_CATEGORY(Zs, space_separator),
109  LEXY_UNICODE_CATEGORY(Zl, line_separator),
110  LEXY_UNICODE_CATEGORY(Zp, paragraph_separator),
111 
// NOTE(review): original line 112 is absent from this listing, and no `Cc` enumerator is visible
// although `Cc` is used further down in the file — presumably it was declared on that line; confirm
// against the real header.
113  LEXY_UNICODE_CATEGORY(Cf, format),
114  LEXY_UNICODE_CATEGORY(Cs, surrogate),
115  LEXY_UNICODE_CATEGORY(Co, private_use),
116  LEXY_UNICODE_CATEGORY(Cn, unassigned),
117 
118 #undef LEXY_UNICODE_CATEGORY
119  };
120 
// A group of general categories; the member categories are the template arguments `Cats`.
// Comparing a group with a single category tests membership rather than identity.
121  template <general_category_t... Cats>
122  struct _gc_group
123  {
// Label string for the group; it plays no part in the comparisons below.
124  const char* name;
125 
// True when `cat` equals any of `Cats` (fold expression over ||). Provided for both operand orders.
126  friend constexpr bool operator==(_gc_group, general_category_t cat)
127  {
128  return ((cat == Cats) || ...);
129  }
130  friend constexpr bool operator==(general_category_t cat, _gc_group)
131  {
132  return ((cat == Cats) || ...);
133  }
134 
// Negation of the membership test. A value-initialized `_gc_group{}` is sufficient because the
// test depends only on the template arguments, not on `name`.
135  friend constexpr bool operator!=(_gc_group, general_category_t cat)
136  {
137  return !(_gc_group{} == cat);
138  }
139  friend constexpr bool operator!=(general_category_t cat, _gc_group)
140  {
141  return !(_gc_group{} == cat);
142  }
143  };
144 
// Each LEXY_UNICODE_CATEGORY_GROUP(Name, Short, Long, cats...) invocation declares two static
// constexpr _gc_group<cats...> objects: `Short`, whose `name` is "code-point." immediately followed
// by Name (adjacent string literals are concatenated), and `Long`, initialized as a copy of `Short`.
145 #define LEXY_UNICODE_CATEGORY_GROUP(Name, Short, Long, ...) \
146  static constexpr _gc_group<__VA_ARGS__> Short{"code-point." Name}; \
147  static constexpr _gc_group<__VA_ARGS__> Long = Short
148 
149  LEXY_UNICODE_CATEGORY_GROUP("cased-letter", LC, cased_letter, Lu, Ll, Lt);
150  LEXY_UNICODE_CATEGORY_GROUP("letter", L, letter, Lu, Ll, Lt, Lm, Lo);
151  LEXY_UNICODE_CATEGORY_GROUP("mark", M, mark, Mn, Mc, Me);
152  LEXY_UNICODE_CATEGORY_GROUP("number", N, number, Nd, Nl, No);
153  LEXY_UNICODE_CATEGORY_GROUP("punctuation", P, punctuation, Pc, Pd, Ps, Pe, Pi, Pf, Po);
154  LEXY_UNICODE_CATEGORY_GROUP("symbol", S, symbol, Sm, Sc, Sk, So);
155  LEXY_UNICODE_CATEGORY_GROUP("separator", Z, separator, Zs, Zl, Zp);
156  LEXY_UNICODE_CATEGORY_GROUP("other", C, other, Cc, Cf, Cs, Co, Cn);
157 
// The helper macro is only needed for the declarations above.
158 #undef LEXY_UNICODE_CATEGORY_GROUP
159 
161 
162  //=== comparison ===//
// Two code points are equal when their stored values are equal; no validity check is involved.
163  friend constexpr bool operator==(code_point lhs, code_point rhs) noexcept
164  {
165  return lhs._value == rhs._value;
166  }
// Two code points differ when their stored values differ.
167  friend constexpr bool operator!=(code_point lhs, code_point rhs) noexcept
168  {
169  return lhs._value != rhs._value;
170  }
171 
172 private:
// Raw stored value. It is not guaranteed to be a valid code point: the constructors store whatever
// they are given, and the default constructor stores 0xFFFF'FFFF.
173  char32_t _value;
174 };
175 
177 } // namespace lexy
178 
179 namespace lexy::_detail
180 {
// NOTE(review): the signature line (original line 181) is not present in this listing; the body
// switches on a parameter named `category`.
// Maps each general category enumerator to a "code-point.<name>" string literal.
// Several strings do not match the enumerator's long name: Mc -> "combining-mark" (enum: spacing_mark),
// Pe -> "close-punctuation" (enum: closing_punctuation), Pi/Pf -> "initial-quote-punctuation" /
// "final-quote-punctuation", Cn -> "not-assigned" (enum: unassigned). These are runtime strings, so
// any change would alter program output.
182 {
183  switch (category)
184  {
185  case lexy::code_point::Lu:
186  return "code-point.uppercase-letter";
187  case lexy::code_point::Ll:
188  return "code-point.lowercase-letter";
189  case lexy::code_point::Lt:
190  return "code-point.titlecase-letter";
191  case lexy::code_point::Lm:
192  return "code-point.modifier-letter";
193  case lexy::code_point::Lo:
194  return "code-point.other-letter";
195 
196  case lexy::code_point::Mn:
197  return "code-point.nonspacing-mark";
198  case lexy::code_point::Mc:
199  return "code-point.combining-mark";
200  case lexy::code_point::Me:
201  return "code-point.enclosing-mark";
202 
203  case lexy::code_point::Nd:
204  return "code-point.decimal-number";
205  case lexy::code_point::Nl:
206  return "code-point.letter-number";
207  case lexy::code_point::No:
208  return "code-point.other-number";
209 
210  case lexy::code_point::Pc:
211  return "code-point.connector-punctuation";
212  case lexy::code_point::Pd:
213  return "code-point.dash-punctuation";
214  case lexy::code_point::Ps:
215  return "code-point.open-punctuation";
216  case lexy::code_point::Pe:
217  return "code-point.close-punctuation";
218  case lexy::code_point::Pi:
219  return "code-point.initial-quote-punctuation";
220  case lexy::code_point::Pf:
221  return "code-point.final-quote-punctuation";
222  case lexy::code_point::Po:
223  return "code-point.other-punctuation";
224 
225  case lexy::code_point::Sm:
226  return "code-point.math-symbol";
227  case lexy::code_point::Sc:
228  return "code-point.currency-symbol";
229  case lexy::code_point::Sk:
230  return "code-point.modifier-symbol";
231  case lexy::code_point::So:
232  return "code-point.other-symbol";
233 
234  case lexy::code_point::Zs:
235  return "code-point.space-separator";
236  case lexy::code_point::Zl:
237  return "code-point.line-separator";
238  case lexy::code_point::Zp:
239  return "code-point.paragraph-separator";
240 
241  case lexy::code_point::Cc:
242  return "code-point.control";
243  case lexy::code_point::Cf:
244  return "code-point.format";
245  case lexy::code_point::Cs:
246  return "code-point.surrogate";
247  case lexy::code_point::Co:
248  return "code-point.private-use";
249  case lexy::code_point::Cn:
250  return "code-point.not-assigned";
251  }
252 
// Only reached when `category` matches none of the case labels above.
253  return nullptr; // unreachable
254 }
255 } // namespace lexy::_detail
256 
257 #if LEXY_HAS_UNICODE_DATABASE
259 
// NOTE(review): the signature line (original line 260) and the line that declares `idx`
// (original line 265) are not present in this listing; confirm against the real header.
// Values that fail is_valid() are reported as `unassigned`; otherwise the result is read from the
// _unicode_db::category table at position `idx`.
261 {
262  if (!is_valid())
263  return general_category_t::unassigned;
264 
266  return _unicode_db::category[idx];
267 }
268 
// NOTE(review): the signature line (original line 269) is not present in this listing; the body
// operates on a parameter named `cp`.
// Returns `cp` unchanged when it fails is_valid(); otherwise returns a code point whose value is
// cp's value plus the case-folding offset looked up for it.
270 {
271  if (!cp.is_valid())
272  return cp;
273 
274  auto idx = _unicode_db::property_index(cp.value());
275  auto offset = _unicode_db::case_folding_offset[idx];
// The value is converted to std::int_least32_t before the offset is added and the sum is converted
// back to char32_t — presumably so that negative offsets work; confirm against the table's type.
276  return code_point(char32_t(std::int_least32_t(cp.value()) + offset));
277 }
278 
279 namespace lexy::_detail
280 {
// Tests whether `cp` has at least one of the binary properties `Props`.
// Each property is used as a bit position; the result is true if ANY of the selected bits is set in
// the table entry for `cp` (not all of them).
// NOTE(review): `cp` is passed to property_index() without a validity check here — presumably the
// caller must ensure it is in range; confirm against property_index().
281 template <lexy::_unicode_db::binary_properties_t... Props>
282 LEXY_FORCE_INLINE constexpr bool code_point_has_properties(char32_t cp)
283 {
// OR together one bit per requested property (fold expression over |).
284  constexpr auto mask = ((1 << Props) | ...);
285 
286  auto idx = _unicode_db::property_index(cp);
287  auto props = _unicode_db::binary_properties[idx];
288  return (props & mask) != 0;
289 }
290 } // namespace lexy::_detail
291 
// With the database available, a property name expands to the entity of that name in ::lexy::_unicode_db.
292 # define LEXY_UNICODE_PROPERTY(Name) ::lexy::_unicode_db::Name
293 
294 #else
295 namespace lexy::_detail
296 {
// Fallback for builds where LEXY_HAS_UNICODE_DATABASE is false: declaration only, no definition
// appears in this branch. The parameter pack is plain `int` here.
297 template <int... Props>
298 bool code_point_has_properties(char32_t cp); // not implemented
299 } // namespace lexy::_detail
300 
// Without the database every property name expands to the integer literal 0.
301 # define LEXY_UNICODE_PROPERTY(Name) 0
302 
303 #endif
304 
305 #endif // LEXY_CODE_POINT_HPP_INCLUDED
306 
lexy::simple_case_fold
LEXY_UNICODE_CONSTEXPR code_point simple_case_fold(code_point cp) noexcept
config.hpp
lexy::code_point::operator!=
constexpr friend bool operator!=(code_point lhs, code_point rhs) noexcept
Definition: code_point.hpp:167
lexy::code_point::_gc_group::operator!=
constexpr friend bool operator!=(general_category_t cat, _gc_group)
Definition: code_point.hpp:139
lexy::_unicode_db::category
constexpr lexy::code_point::general_category_t category[]
Definition: unicode_database.hpp:65
lexy::code_point::_value
char32_t _value
Definition: code_point.hpp:173
lexy::_unicode_db::binary_properties_t
binary_properties_t
Definition: unicode_database.hpp:67
lexyd::code_point
constexpr auto code_point
Matches a single Unicode code point in the current Unicode encoding.
Definition: dsl/code_point.hpp:200
lexy::code_point::is_ascii
constexpr bool is_ascii() const noexcept
Definition: code_point.hpp:32
LEXY_UNICODE_CONSTEXPR
#define LEXY_UNICODE_CONSTEXPR
Definition: code_point.hpp:14
lexy
Definition: any_ref.hpp:12
lexy::code_point::general_category_t
general_category_t
Definition: code_point.hpp:76
lexy::code_point::value
constexpr auto value() const noexcept
Definition: code_point.hpp:26
lexy::code_point::_gc_group::operator!=
constexpr friend bool operator!=(_gc_group, general_category_t cat)
Definition: code_point.hpp:135
lexy::_detail::cat
typename cat_< A, B >::type cat
Definition: nttp_string.hpp:105
lexy::code_point::LEXY_UNICODE_CATEGORY
@ LEXY_UNICODE_CATEGORY
Definition: code_point.hpp:81
lexyd::symbol
constexpr auto symbol
Parses optional rule, then matches the resulting lexeme against the symbol table.
Definition: symbol.hpp:539
lexy::_unicode_db::case_folding_offset
constexpr std::int_least32_t case_folding_offset[]
Definition: unicode_database.hpp:82
lexy::_unicode_db::binary_properties
constexpr std::uint_least8_t binary_properties[]
Definition: unicode_database.hpp:80
lexy::code_point::general_category
LEXY_UNICODE_CONSTEXPR general_category_t general_category() const noexcept
lexy::code_point::is_bmp
constexpr bool is_bmp() const noexcept
Definition: code_point.hpp:36
lexy::code_point::_gc_group::operator==
constexpr friend bool operator==(_gc_group, general_category_t cat)
Definition: code_point.hpp:126
assert.hpp
lexy::code_point::code_point
constexpr code_point(char32_t value) noexcept
Definition: code_point.hpp:24
lexy::_detail::general_category_name
constexpr const char * general_category_name(lexy::code_point::general_category_t category)
Definition: code_point.hpp:181
lexy::code_point::is_valid
constexpr bool is_valid() const noexcept
Definition: code_point.hpp:40
lexy::_detail
Definition: any_ref.hpp:12
lexy::code_point::LEXY_UNICODE_CATEGORY_GROUP
LEXY_UNICODE_CATEGORY_GROUP("cased-letter", LC, cased_letter, Lu, Ll, Lt)
LEXY_FORCE_INLINE
#define LEXY_FORCE_INLINE
Definition: config.hpp:171
lexyd::ascii::control
constexpr auto control
Definition: ascii.hpp:42
lexy::code_point::_gc_group::name
const char * name
Definition: code_point.hpp:124
lexy::_detail::code_point_has_properties
bool code_point_has_properties(char32_t cp)
lexy::code_point::_gc_group
Definition: code_point.hpp:122
lexy::_unicode_db::property_index
constexpr std::size_t property_index(char32_t code_point)
Definition: unicode_database.hpp:54
lexy::code_point::code_point
constexpr code_point() noexcept
Definition: code_point.hpp:23
unicode_database.hpp
lexy::code_point::_gc_group::operator==
constexpr friend bool operator==(general_category_t cat, _gc_group)
Definition: code_point.hpp:130
lexy::code_point
A Unicode code point.
Definition: code_point.hpp:20


behaviortree_cpp_v4
Author(s): Davide Faconti
autogenerated on Fri Dec 13 2024 03:19:16