code_point.hpp
Go to the documentation of this file.
1 // Copyright (C) 2020-2023 Jonathan Müller and lexy contributors
2 // SPDX-License-Identifier: BSL-1.0
3 
4 #ifndef LEXY_CODE_POINT_HPP_INCLUDED
5 #define LEXY_CODE_POINT_HPP_INCLUDED
6 
7 #include <cstdint>
10 
11 #ifndef LEXY_HAS_UNICODE_DATABASE
12 # define LEXY_HAS_UNICODE_DATABASE 0
13 #endif
14 
15 #if LEXY_HAS_UNICODE_DATABASE
16 # define LEXY_UNICODE_CONSTEXPR constexpr
17 #else
18 # define LEXY_UNICODE_CONSTEXPR
19 #endif
20 
namespace lexy
{
/// A unicode code point.
///
/// Stores a single `char32_t`. The value is not validated on construction;
/// use the classification functions below to inspect it.
class code_point
{
public:
    /// Creates a code point holding 0xFFFF'FFFF, a value for which `is_valid()` is false.
    constexpr code_point() noexcept : _value(0xFFFF'FFFF) {}
    /// Wraps `value` as-is, without any range check.
    constexpr explicit code_point(char32_t value) noexcept : _value(value) {}

    /// Returns the stored value.
    constexpr auto value() const noexcept
    {
        return _value;
    }

    //=== classification ===//
    /// True for values up to and including 0x7F.
    constexpr bool is_ascii() const noexcept
    {
        return _value <= 0x7F;
    }
    /// True for values up to and including 0xFFFF.
    constexpr bool is_bmp() const noexcept
    {
        return _value <= 0xFFFF;
    }
    /// True for values up to and including 0x10'FFFF (surrogates are not excluded here).
    constexpr bool is_valid() const noexcept
    {
        return _value <= 0x10'FFFF;
    }

    /// True for 0x00-0x1F and 0x7F-0x9F.
    constexpr bool is_control() const noexcept
    {
        return _value <= 0x1F || (0x7F <= _value && _value <= 0x9F);
    }
    /// True for 0xD800-0xDFFF.
    constexpr bool is_surrogate() const noexcept
    {
        return 0xD800 <= _value && _value <= 0xDFFF;
    }
    /// True for 0xE000-0xF8FF, 0x0F'0000-0x0F'FFFD and 0x10'0000-0x10'FFFD.
    constexpr bool is_private_use() const noexcept
    {
        return (0xE000 <= _value && _value <= 0xF8FF)
               || (0x0F'0000 <= _value && _value <= 0x0F'FFFD)
               || (0x10'0000 <= _value && _value <= 0x10'FFFD);
    }
    /// True for 0xFDD0-0xFDEF and for any value whose low 16 bits are 0xFFFE or 0xFFFF.
    ///
    /// The second test does not check `is_valid()`, so it is also true for
    /// out-of-range values with those low bits, such as a default-constructed code point.
    constexpr bool is_noncharacter() const noexcept
    {
        // Contiguous range of 32 non-characters.
        if (0xFDD0 <= _value && _value <= 0xFDEF)
            return true;

        // Last two code points of every plane.
        auto in_plane = _value & 0xFFFF;
        return in_plane == 0xFFFE || in_plane == 0xFFFF;
    }

    /// True if the value is valid and not a surrogate.
    constexpr bool is_scalar() const noexcept
    {
        return is_valid() && !is_surrogate();
    }

    //=== general category ===//
    /// The general categories, each available under a two-letter and a long name.
    ///
    /// The long name is an alias with the same value as the short one, so the
    /// short names are numbered consecutively in declaration order.
    enum general_category_t
    {
    // NOLINTNEXTLINE: can't use parentheses here
#define LEXY_UNICODE_CATEGORY(Short, Long) Short, Long = Short

        LEXY_UNICODE_CATEGORY(Lu, uppercase_letter),
        LEXY_UNICODE_CATEGORY(Ll, lowercase_letter),
        LEXY_UNICODE_CATEGORY(Lt, titlecase_letter),
        LEXY_UNICODE_CATEGORY(Lm, modifier_letter),
        LEXY_UNICODE_CATEGORY(Lo, other_letter),

        LEXY_UNICODE_CATEGORY(Mn, nonspacing_mark),
        LEXY_UNICODE_CATEGORY(Mc, spaing_mark), // misspelled name kept for existing callers
        spacing_mark = Mc,
        LEXY_UNICODE_CATEGORY(Me, enclosing_mark),

        LEXY_UNICODE_CATEGORY(Nd, decimal_number),
        LEXY_UNICODE_CATEGORY(Nl, letter_number),
        LEXY_UNICODE_CATEGORY(No, other_number),

        LEXY_UNICODE_CATEGORY(Pc, connector_punctuation),
        LEXY_UNICODE_CATEGORY(Pd, dash_punctuation),
        LEXY_UNICODE_CATEGORY(Ps, open_punctuation),
        LEXY_UNICODE_CATEGORY(Pe, closing_punctuation),
        LEXY_UNICODE_CATEGORY(Pi, initial_puncutation), // misspelled name kept for existing callers
        initial_punctuation = Pi,
        LEXY_UNICODE_CATEGORY(Pf, final_puncutation), // misspelled name kept for existing callers
        final_punctuation = Pf,
        LEXY_UNICODE_CATEGORY(Po, other_punctuation),

        LEXY_UNICODE_CATEGORY(Sm, math_symbol),
        LEXY_UNICODE_CATEGORY(Sc, currency_symbol),
        LEXY_UNICODE_CATEGORY(Sk, modifier_symbol),
        LEXY_UNICODE_CATEGORY(So, other_symbol),

        LEXY_UNICODE_CATEGORY(Zs, space_separator),
        LEXY_UNICODE_CATEGORY(Zl, line_separator),
        LEXY_UNICODE_CATEGORY(Zp, paragraph_separator),

        LEXY_UNICODE_CATEGORY(Cc, control),
        LEXY_UNICODE_CATEGORY(Cf, format),
        LEXY_UNICODE_CATEGORY(Cs, surrogate),
        LEXY_UNICODE_CATEGORY(Co, private_use),
        LEXY_UNICODE_CATEGORY(Cn, unassigned),

#undef LEXY_UNICODE_CATEGORY
    };

    /// A named set of general categories.
    ///
    /// Comparing a group with a `general_category_t` yields true if the category
    /// is one of `Cats`; the `name` member does not take part in the comparison.
    template <general_category_t... Cats>
    struct _gc_group
    {
        const char* name;

        friend constexpr bool operator==(_gc_group, general_category_t cat)
        {
            return ((cat == Cats) || ...);
        }
        friend constexpr bool operator==(general_category_t cat, _gc_group)
        {
            return ((cat == Cats) || ...);
        }

        friend constexpr bool operator!=(_gc_group, general_category_t cat)
        {
            return !(_gc_group{} == cat);
        }
        friend constexpr bool operator!=(general_category_t cat, _gc_group)
        {
            return !(_gc_group{} == cat);
        }
    };

// Defines a group under a short and a long member name; both carry the name
// string "code-point.<Name>".
#define LEXY_UNICODE_CATEGORY_GROUP(Name, Short, Long, ...)                                        \
    static constexpr _gc_group<__VA_ARGS__> Short{"code-point." Name};                             \
    static constexpr _gc_group<__VA_ARGS__> Long = Short

    LEXY_UNICODE_CATEGORY_GROUP("cased-letter", LC, cased_letter, Lu, Ll, Lt);
    LEXY_UNICODE_CATEGORY_GROUP("letter", L, letter, Lu, Ll, Lt, Lm, Lo);
    LEXY_UNICODE_CATEGORY_GROUP("mark", M, mark, Mn, Mc, Me);
    LEXY_UNICODE_CATEGORY_GROUP("number", N, number, Nd, Nl, No);
    LEXY_UNICODE_CATEGORY_GROUP("punctuation", P, punctuation, Pc, Pd, Ps, Pe, Pi, Pf, Po);
    LEXY_UNICODE_CATEGORY_GROUP("symbol", S, symbol, Sm, Sc, Sk, So);
    LEXY_UNICODE_CATEGORY_GROUP("separator", Z, separator, Zs, Zl, Zp);
    LEXY_UNICODE_CATEGORY_GROUP("other", C, other, Cc, Cf, Cs, Co, Cn);

#undef LEXY_UNICODE_CATEGORY_GROUP

    /// Returns the general category of the code point.
    ///
    /// Only defined when LEXY_HAS_UNICODE_DATABASE is non-zero.
    LEXY_UNICODE_CONSTEXPR general_category_t general_category() const noexcept;

    //=== comparison ===//
    /// Two code points are equal if their stored values are equal.
    friend constexpr bool operator==(code_point lhs, code_point rhs) noexcept
    {
        return lhs._value == rhs._value;
    }
    friend constexpr bool operator!=(code_point lhs, code_point rhs) noexcept
    {
        return lhs._value != rhs._value;
    }

private:
    char32_t _value;
};

/// Applies simple case folding to `cp`.
///
/// Only defined when LEXY_HAS_UNICODE_DATABASE is non-zero.
LEXY_UNICODE_CONSTEXPR code_point simple_case_fold(code_point cp) noexcept;
} // namespace lexy
182 
183 namespace lexy::_detail
184 {
186 {
187  switch (category)
188  {
189  case lexy::code_point::Lu:
190  return "code-point.uppercase-letter";
191  case lexy::code_point::Ll:
192  return "code-point.lowercase-letter";
193  case lexy::code_point::Lt:
194  return "code-point.titlecase-letter";
195  case lexy::code_point::Lm:
196  return "code-point.modifier-letter";
197  case lexy::code_point::Lo:
198  return "code-point.other-letter";
199 
200  case lexy::code_point::Mn:
201  return "code-point.nonspacing-mark";
202  case lexy::code_point::Mc:
203  return "code-point.combining-mark";
204  case lexy::code_point::Me:
205  return "code-point.enclosing-mark";
206 
207  case lexy::code_point::Nd:
208  return "code-point.decimal-number";
209  case lexy::code_point::Nl:
210  return "code-point.letter-number";
211  case lexy::code_point::No:
212  return "code-point.other-number";
213 
214  case lexy::code_point::Pc:
215  return "code-point.connector-punctuation";
216  case lexy::code_point::Pd:
217  return "code-point.dash-punctuation";
218  case lexy::code_point::Ps:
219  return "code-point.open-punctuation";
220  case lexy::code_point::Pe:
221  return "code-point.close-punctuation";
222  case lexy::code_point::Pi:
223  return "code-point.initial-quote-punctuation";
224  case lexy::code_point::Pf:
225  return "code-point.final-quote-punctuation";
226  case lexy::code_point::Po:
227  return "code-point.other-punctuation";
228 
229  case lexy::code_point::Sm:
230  return "code-point.math-symbol";
231  case lexy::code_point::Sc:
232  return "code-point.currency-symbol";
233  case lexy::code_point::Sk:
234  return "code-point.modifier-symbol";
235  case lexy::code_point::So:
236  return "code-point.other-symbol";
237 
238  case lexy::code_point::Zs:
239  return "code-point.space-separator";
240  case lexy::code_point::Zl:
241  return "code-point.line-separator";
242  case lexy::code_point::Zp:
243  return "code-point.paragraph-separator";
244 
245  case lexy::code_point::Cc:
246  return "code-point.control";
247  case lexy::code_point::Cf:
248  return "code-point.format";
249  case lexy::code_point::Cs:
250  return "code-point.surrogate";
251  case lexy::code_point::Co:
252  return "code-point.private-use";
253  case lexy::code_point::Cn:
254  return "code-point.not-assigned";
255  }
256 
257  return nullptr; // unreachable
258 }
259 } // namespace lexy::_detail
260 
261 #if LEXY_HAS_UNICODE_DATABASE
263 
265 {
266  if (!is_valid())
267  return general_category_t::unassigned;
268 
270  return _unicode_db::category[idx];
271 }
272 
274 {
275  if (!cp.is_valid())
276  return cp;
277 
278  auto idx = _unicode_db::property_index(cp.value());
279  auto offset = _unicode_db::case_folding_offset[idx];
280  return code_point(char32_t(std::int_least32_t(cp.value()) + offset));
281 }
282 
283 namespace lexy::_detail
284 {
285 template <lexy::_unicode_db::binary_properties_t... Props>
286 LEXY_FORCE_INLINE constexpr bool code_point_has_properties(char32_t cp)
287 {
288  constexpr auto mask = ((1 << Props) | ...);
289 
290  auto idx = _unicode_db::property_index(cp);
291  auto props = _unicode_db::binary_properties[idx];
292  return (props & mask) != 0;
293 }
294 } // namespace lexy::_detail
295 
296 # define LEXY_UNICODE_PROPERTY(Name) ::lexy::_unicode_db::Name
297 
298 #else
299 namespace lexy::_detail
300 {
// Fallback used when LEXY_HAS_UNICODE_DATABASE is 0: the function is only
// declared, never defined, and takes plain `int` property values.
// NOTE(review): presumably intended so that any call actually emitted in this
// configuration fails to link - confirm.
301 template <int... Props>
302 bool code_point_has_properties(char32_t cp); // not implemented
303 } // namespace lexy::_detail
304 
305 # define LEXY_UNICODE_PROPERTY(Name) 0
306 
307 #endif
308 
309 #endif // LEXY_CODE_POINT_HPP_INCLUDED
310 
lexy::simple_case_fold
LEXY_UNICODE_CONSTEXPR code_point simple_case_fold(code_point cp) noexcept
config.hpp
lexy::code_point::operator!=
constexpr friend bool operator!=(code_point lhs, code_point rhs) noexcept
Definition: code_point.hpp:171
lexy::code_point::_gc_group::operator!=
constexpr friend bool operator!=(general_category_t cat, _gc_group)
Definition: code_point.hpp:143
lexy::_unicode_db::category
constexpr lexy::code_point::general_category_t category[]
Definition: unicode_database.hpp:65
lexy::code_point::_value
char32_t _value
Definition: code_point.hpp:177
lexy::_unicode_db::binary_properties_t
binary_properties_t
Definition: unicode_database.hpp:67
lexyd::code_point
constexpr auto code_point
Matches a single unicode code point in the current unicode encoding.
Definition: dsl/code_point.hpp:200
lexy::code_point::is_ascii
constexpr bool is_ascii() const noexcept
Definition: code_point.hpp:36
LEXY_UNICODE_CONSTEXPR
#define LEXY_UNICODE_CONSTEXPR
Definition: code_point.hpp:18
lexy
Definition: any_ref.hpp:12
lexy::code_point::general_category_t
general_category_t
Definition: code_point.hpp:80
lexy::code_point::value
constexpr auto value() const noexcept
Definition: code_point.hpp:30
lexy::code_point::_gc_group::operator!=
constexpr friend bool operator!=(_gc_group, general_category_t cat)
Definition: code_point.hpp:139
lexy::_detail::cat
typename cat_< A, B >::type cat
Definition: nttp_string.hpp:105
lexy::code_point::LEXY_UNICODE_CATEGORY
@ LEXY_UNICODE_CATEGORY
Definition: code_point.hpp:85
lexyd::symbol
constexpr auto symbol
Parses optional rule, then matches the resulting lexeme against the symbol table.
Definition: symbol.hpp:532
lexy::_unicode_db::case_folding_offset
constexpr std::int_least32_t case_folding_offset[]
Definition: unicode_database.hpp:82
lexy::_unicode_db::binary_properties
constexpr std::uint_least8_t binary_properties[]
Definition: unicode_database.hpp:80
lexy::code_point::general_category
LEXY_UNICODE_CONSTEXPR general_category_t general_category() const noexcept
lexy::code_point::is_bmp
constexpr bool is_bmp() const noexcept
Definition: code_point.hpp:40
lexy::code_point::_gc_group::operator==
constexpr friend bool operator==(_gc_group, general_category_t cat)
Definition: code_point.hpp:130
assert.hpp
lexy::code_point::code_point
constexpr code_point(char32_t value) noexcept
Definition: code_point.hpp:28
lexy::_detail::general_category_name
constexpr const char * general_category_name(lexy::code_point::general_category_t category)
Definition: code_point.hpp:185
lexy::code_point::is_valid
constexpr bool is_valid() const noexcept
Definition: code_point.hpp:44
lexy::_detail
Definition: any_ref.hpp:12
lexy::code_point::LEXY_UNICODE_CATEGORY_GROUP
LEXY_UNICODE_CATEGORY_GROUP("cased-letter", LC, cased_letter, Lu, Ll, Lt)
LEXY_FORCE_INLINE
#define LEXY_FORCE_INLINE
Definition: config.hpp:148
lexyd::ascii::control
constexpr auto control
Definition: ascii.hpp:42
lexy::code_point::_gc_group::name
const char * name
Definition: code_point.hpp:128
lexy::_detail::code_point_has_properties
bool code_point_has_properties(char32_t cp)
lexy::code_point::_gc_group
Definition: code_point.hpp:126
lexy::_unicode_db::property_index
constexpr std::size_t property_index(char32_t code_point)
Definition: unicode_database.hpp:54
lexy::code_point::code_point
constexpr code_point() noexcept
Definition: code_point.hpp:27
unicode_database.hpp
lexy::code_point::_gc_group::operator==
constexpr friend bool operator==(general_category_t cat, _gc_group)
Definition: code_point.hpp:134
lexy::code_point
A unicode code point.
Definition: code_point.hpp:24


behaviortree_cpp_v4
Author(s): Davide Faconti
autogenerated on Fri Jun 28 2024 02:20:07