_detail/code_point.hpp
Go to the documentation of this file.
// Copyright (C) 2020-2023 Jonathan Müller and lexy contributors
// SPDX-License-Identifier: BSL-1.0
3 
4 #ifndef LEXY_DETAIL_CODE_POINT_HPP_INCLUDED
5 #define LEXY_DETAIL_CODE_POINT_HPP_INCLUDED
6 
7 #include <lexy/input/base.hpp>
8 
//=== encoding ===//
10 namespace lexy::_detail
11 {
12 template <typename Encoding>
13 constexpr std::size_t encode_code_point(char32_t cp, typename Encoding::char_type* buffer,
14  std::size_t size)
15 {
16  if constexpr (std::is_same_v<Encoding, lexy::ascii_encoding>)
17  {
18  LEXY_PRECONDITION(size >= 1);
19 
20  *buffer = char(cp);
21  return 1;
22  }
23  else if constexpr (std::is_same_v<Encoding,
25  || std::is_same_v<Encoding, lexy::utf8_char_encoding>)
26  {
27  using char_type = typename Encoding::char_type;
28  // Taken from http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html.
29  if (cp <= 0x7F)
30  {
31  LEXY_PRECONDITION(size >= 1);
32 
33  buffer[0] = char_type(cp);
34  return 1;
35  }
36  else if (cp <= 0x07'FF)
37  {
38  LEXY_PRECONDITION(size >= 2);
39 
40  auto first = (cp >> 6) & 0x1F;
41  auto second = (cp >> 0) & 0x3F;
42 
43  buffer[0] = char_type(0xC0 | first);
44  buffer[1] = char_type(0x80 | second);
45  return 2;
46  }
47  else if (cp <= 0xFF'FF)
48  {
49  LEXY_PRECONDITION(size >= 3);
50 
51  auto first = (cp >> 12) & 0x0F;
52  auto second = (cp >> 6) & 0x3F;
53  auto third = (cp >> 0) & 0x3F;
54 
55  buffer[0] = char_type(0xE0 | first);
56  buffer[1] = char_type(0x80 | second);
57  buffer[2] = char_type(0x80 | third);
58  return 3;
59  }
60  else
61  {
62  LEXY_PRECONDITION(size >= 4);
63 
64  auto first = (cp >> 18) & 0x07;
65  auto second = (cp >> 12) & 0x3F;
66  auto third = (cp >> 6) & 0x3F;
67  auto fourth = (cp >> 0) & 0x3F;
68 
69  buffer[0] = char_type(0xF0 | first);
70  buffer[1] = char_type(0x80 | second);
71  buffer[2] = char_type(0x80 | third);
72  buffer[3] = char_type(0x80 | fourth);
73  return 4;
74  }
75  }
76  else if constexpr (std::is_same_v<Encoding, lexy::utf16_encoding>)
77  {
78  if (cp <= 0xFF'FF)
79  {
80  LEXY_PRECONDITION(size >= 1);
81 
82  buffer[0] = char16_t(cp);
83  return 1;
84  }
85  else
86  {
87  // Algorithm implemented from
88  // https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF.
89  LEXY_PRECONDITION(size >= 2);
90 
91  auto u_prime = cp - 0x1'0000;
92  auto high_ten_bits = u_prime >> 10;
93  auto low_ten_bits = u_prime & 0b0000'0011'1111'1111;
94 
95  buffer[0] = char16_t(0xD800 + high_ten_bits);
96  buffer[1] = char16_t(0xDC00 + low_ten_bits);
97  return 2;
98  }
99  }
100  else if constexpr (std::is_same_v<Encoding, lexy::utf32_encoding>)
101  {
102  LEXY_PRECONDITION(size >= 1);
103 
104  *buffer = char32_t(cp);
105  return 1;
106  }
107  else
108  {
109  static_assert(lexy::_detail::error<Encoding>,
110  "cannot encode a code point in this encoding");
111  (void)cp;
112  (void)buffer;
113  (void)size;
114  return 0;
115  }
116 }
117 } // namespace lexy::_detail
118 
//=== parsing ===//
120 namespace lexy::_detail
121 {
// Outcome of parsing a single code point with parse_code_point().
enum class cp_error
{
    // A code point was decoded and passed all checks.
    success,
    // The reader reported EOF before any code unit of the code point.
    eof,
    // The first code unit is one that may only follow a lead unit
    // (UTF-8 continuation byte or UTF-16 low surrogate).
    leads_with_trailing,
    // A lead unit was not followed by the trailing unit(s) it requires.
    missing_trailing,
    // The decoded value lies in the surrogate range U+D800..U+DFFF.
    surrogate,
    // A UTF-8 sequence used more bytes than the value needs.
    overlong_sequence,
    // The decoded value is above the encoding's maximum
    // (0x7F for ASCII, U+10FFFF for UTF-8 and UTF-32).
    out_of_range,
};
132 
133 template <typename Reader>
134 struct cp_result
135 {
136  char32_t cp;
137  cp_error error;
138  typename Reader::iterator end;
139 };
140 
141 template <typename Reader>
142 constexpr cp_result<Reader> parse_code_point(Reader reader)
143 {
144  if constexpr (std::is_same_v<typename Reader::encoding, lexy::ascii_encoding>)
145  {
146  if (reader.peek() == Reader::encoding::eof())
147  return {{}, cp_error::eof, reader.position()};
148 
149  auto cur = reader.peek();
150  reader.bump();
151 
152  auto cp = static_cast<char32_t>(cur);
153  if (cp <= 0x7F)
154  return {cp, cp_error::success, reader.position()};
155  else
156  return {cp, cp_error::out_of_range, reader.position()};
157  }
158  else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf8_encoding> //
159  || std::is_same_v<typename Reader::encoding, lexy::utf8_char_encoding>)
160  {
161  using uchar_t = unsigned char;
162  constexpr auto payload_lead1 = 0b0111'1111;
163  constexpr auto payload_lead2 = 0b0001'1111;
164  constexpr auto payload_lead3 = 0b0000'1111;
165  constexpr auto payload_lead4 = 0b0000'0111;
166  constexpr auto payload_cont = 0b0011'1111;
167 
168  constexpr auto pattern_lead1 = 0b0 << 7;
169  constexpr auto pattern_lead2 = 0b110 << 5;
170  constexpr auto pattern_lead3 = 0b1110 << 4;
171  constexpr auto pattern_lead4 = 0b11110 << 3;
172  constexpr auto pattern_cont = 0b10 << 6;
173 
174  auto first = uchar_t(reader.peek());
175  if ((first & ~payload_lead1) == pattern_lead1)
176  {
177  // ASCII character.
178  reader.bump();
179  return {first, cp_error::success, reader.position()};
180  }
181  else if ((first & ~payload_cont) == pattern_cont)
182  {
183  return {{}, cp_error::leads_with_trailing, reader.position()};
184  }
185  else if ((first & ~payload_lead2) == pattern_lead2)
186  {
187  reader.bump();
188 
189  auto second = uchar_t(reader.peek());
190  if ((second & ~payload_cont) != pattern_cont)
191  return {{}, cp_error::missing_trailing, reader.position()};
192  reader.bump();
193 
194  auto result = char32_t(first & payload_lead2);
195  result <<= 6;
196  result |= char32_t(second & payload_cont);
197 
198  // C0 and C1 are overlong ASCII.
199  if (first == 0xC0 || first == 0xC1)
200  return {result, cp_error::overlong_sequence, reader.position()};
201  else
202  return {result, cp_error::success, reader.position()};
203  }
204  else if ((first & ~payload_lead3) == pattern_lead3)
205  {
206  reader.bump();
207 
208  auto second = uchar_t(reader.peek());
209  if ((second & ~payload_cont) != pattern_cont)
210  return {{}, cp_error::missing_trailing, reader.position()};
211  reader.bump();
212 
213  auto third = uchar_t(reader.peek());
214  if ((third & ~payload_cont) != pattern_cont)
215  return {{}, cp_error::missing_trailing, reader.position()};
216  reader.bump();
217 
218  auto result = char32_t(first & payload_lead3);
219  result <<= 6;
220  result |= char32_t(second & payload_cont);
221  result <<= 6;
222  result |= char32_t(third & payload_cont);
223 
224  auto cp = result;
225  if (0xD800 <= cp && cp <= 0xDFFF)
226  return {cp, cp_error::surrogate, reader.position()};
227  else if (first == 0xE0 && second < 0xA0)
228  return {cp, cp_error::overlong_sequence, reader.position()};
229  else
230  return {cp, cp_error::success, reader.position()};
231  }
232  else if ((first & ~payload_lead4) == pattern_lead4)
233  {
234  reader.bump();
235 
236  auto second = uchar_t(reader.peek());
237  if ((second & ~payload_cont) != pattern_cont)
238  return {{}, cp_error::missing_trailing, reader.position()};
239  reader.bump();
240 
241  auto third = uchar_t(reader.peek());
242  if ((third & ~payload_cont) != pattern_cont)
243  return {{}, cp_error::missing_trailing, reader.position()};
244  reader.bump();
245 
246  auto fourth = uchar_t(reader.peek());
247  if ((fourth & ~payload_cont) != pattern_cont)
248  return {{}, cp_error::missing_trailing, reader.position()};
249  reader.bump();
250 
251  auto result = char32_t(first & payload_lead4);
252  result <<= 6;
253  result |= char32_t(second & payload_cont);
254  result <<= 6;
255  result |= char32_t(third & payload_cont);
256  result <<= 6;
257  result |= char32_t(fourth & payload_cont);
258 
259  auto cp = result;
260  if (cp > 0x10'FFFF)
261  return {cp, cp_error::out_of_range, reader.position()};
262  else if (first == 0xF0 && second < 0x90)
263  return {cp, cp_error::overlong_sequence, reader.position()};
264  else
265  return {cp, cp_error::success, reader.position()};
266  }
267  else // FE or FF
268  {
269  return {{}, cp_error::eof, reader.position()};
270  }
271  }
272  else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf16_encoding>)
273  {
274  constexpr auto payload1 = 0b0000'0011'1111'1111;
275  constexpr auto payload2 = payload1;
276 
277  constexpr auto pattern1 = 0b110110 << 10;
278  constexpr auto pattern2 = 0b110111 << 10;
279 
280  if (reader.peek() == Reader::encoding::eof())
281  return {{}, cp_error::eof, reader.position()};
282 
283  auto first = char16_t(reader.peek());
284  if ((first & ~payload1) == pattern1)
285  {
286  reader.bump();
287  if (reader.peek() == Reader::encoding::eof())
288  return {{}, cp_error::missing_trailing, reader.position()};
289 
290  auto second = char16_t(reader.peek());
291  if ((second & ~payload2) != pattern2)
292  return {{}, cp_error::missing_trailing, reader.position()};
293  reader.bump();
294 
295  // We've got a valid code point.
296  auto result = char32_t(first & payload1);
297  result <<= 10;
298  result |= char32_t(second & payload2);
299  result |= 0x10000;
300  return {result, cp_error::success, reader.position()};
301  }
302  else if ((first & ~payload2) == pattern2)
303  {
304  return {{}, cp_error::leads_with_trailing, reader.position()};
305  }
306  else
307  {
308  // Single code unit code point; always valid.
309  reader.bump();
310  return {first, cp_error::success, reader.position()};
311  }
312  }
313  else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf32_encoding>)
314  {
315  if (reader.peek() == Reader::encoding::eof())
316  return {{}, cp_error::eof, reader.position()};
317 
318  auto cur = reader.peek();
319  reader.bump();
320 
321  auto cp = cur;
322  if (cp > 0x10'FFFF)
323  return {cp, cp_error::out_of_range, reader.position()};
324  else if (0xD800 <= cp && cp <= 0xDFFF)
325  return {cp, cp_error::surrogate, reader.position()};
326  else
327  return {cp, cp_error::success, reader.position()};
328  }
329  else
330  {
331  static_assert(lexy::_detail::error<typename Reader::encoding>,
332  "no known code point for this encoding");
333  return {};
334  }
335 }
336 
337 template <typename Reader>
338 constexpr void recover_code_point(Reader& reader, cp_result<Reader> result)
339 {
340  switch (result.error)
341  {
342  case cp_error::success:
343  // Consume the entire code point.
344  reader.set_position(result.end);
345  break;
346  case cp_error::eof:
347  // We don't need to do anything to "recover" from EOF.
348  break;
349 
351  // Invalid code unit, consume to recover.
352  LEXY_PRECONDITION(result.end == reader.position());
353  reader.bump();
354  break;
355 
357  case cp_error::surrogate:
360  // Consume all the invalid code units to recover.
361  reader.set_position(result.end);
362  break;
363  }
364 }
365 } // namespace lexy::_detail
366 
367 #endif // LEXY_DETAIL_CODE_POINT_HPP_INCLUDED
368 
cx::size
constexpr auto size(const C &c) -> decltype(c.size())
Definition: wildcards.hpp:636
magic_enum::char_type
string_view::value_type char_type
Definition: magic_enum.hpp:145
lexy::_detail::cp_error::missing_trailing
@ missing_trailing
lexy::_detail::cp_error::out_of_range
@ out_of_range
lexy::_detail::cp_error::eof
@ eof
LEXY_PRECONDITION
#define LEXY_PRECONDITION(Expr)
Definition: assert.hpp:36
lexy::_detail::uchar_t
decltype(make_uchar(CharT())) uchar_t
Definition: swar.hpp:42
lexy::_detail::cp_error::surrogate
@ surrogate
lexy::_detail::encode_code_point
constexpr std::size_t encode_code_point(char32_t cp, typename Encoding::char_type *buffer, std::size_t size)
Definition: _detail/code_point.hpp:13
lexy::_detail::cp_error::overlong_sequence
@ overlong_sequence
lexy::utf8_encoding
An encoding where the input is assumed to be valid UTF-8.
Definition: encoding.hpp:84
lexy::_detail::cp_error::leads_with_trailing
@ leads_with_trailing
wildcards::detail::is_set_state::first
@ first
lexy::_detail
Definition: any_ref.hpp:12
lexy::buffer
Definition: buffer.hpp:81
lexy::_detail::cp_error::success
@ success
base.hpp
lexyd::eof
constexpr auto eof
Matches EOF.
Definition: eof.hpp:72


behaviortree_cpp_v4
Author(s): Davide Faconti
autogenerated on Fri Jun 28 2024 02:20:07