4 #ifndef LEXY_DETAIL_CODE_POINT_HPP_INCLUDED
5 #define LEXY_DETAIL_CODE_POINT_HPP_INCLUDED
// Encodes the code point `cp` into `buffer` using the code units of `Encoding`.
// `size` is checked with LEXY_PRECONDITION against the number of units each
// branch is about to write.
// NOTE(review): the function signature, the braces and several body lines are not
// visible in this excerpt; the comments below describe only the statements shown.
12 template <
typename Encoding>
// ASCII branch (its body is not visible here).
16 if constexpr (std::is_same_v<Encoding, lexy::ascii_encoding>)
// UTF-8 branch, shared by the encodings named in this condition.
23 else if constexpr (std::is_same_v<Encoding,
25 || std::is_same_v<Encoding, lexy::utf8_char_encoding>)
// Code points up to 0x07FF use two code units: the lead unit is 0xC0 combined with
// bits 6..10 of cp, the trailing unit is 0x80 combined with bits 0..5.
36 else if (cp <= 0x07
'FF)
38 LEXY_PRECONDITION(size >= 2);
40 auto first = (cp >> 6) & 0x1F;
41 auto second = (cp >> 0) & 0x3F;
43 buffer[0] = char_type(0xC0 | first);
44 buffer[1] = char_type(0x80 | second);
47 else if (cp <= 0xFF'FF)
// Code points up to 0xFFFF are split into 4 + 6 + 6 payload bits
// (the stores into buffer are not visible here).
51 auto first = (cp >> 12) & 0x0F;
52 auto second = (cp >> 6) & 0x3F;
53 auto third = (cp >> 0) & 0x3F;
// Next visible branch: cp is split into 3 + 6 + 6 + 6 payload bits
// (its condition and stores are not visible here).
64 auto first = (cp >> 18) & 0x07;
65 auto second = (cp >> 12) & 0x3F;
66 auto third = (cp >> 6) & 0x3F;
67 auto fourth = (cp >> 0) & 0x3F;
// UTF-16 branch.
76 else if constexpr (std::is_same_v<Encoding, lexy::utf16_encoding>)
// One-unit case: cp is converted directly to a single char16_t
// (the condition selecting this case is not visible here).
80 LEXY_PRECONDITION(size >= 1);
82 buffer[0] = char16_t(cp);
// Two-unit case: cp is reduced by 0x10000 and the remainder is divided into
// an upper part (shifted down by 10) and a lower part (low ten bits).
87 // Algorithm implemented from
88 // https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF.
89 LEXY_PRECONDITION(size >= 2);
91 auto u_prime = cp - 0x1'0000;
92 auto high_ten_bits = u_prime >> 10;
93 auto low_ten_bits = u_prime & 0b0000
'0011'1111
'1111;
// The upper part is offset from 0xD800 and the lower part from 0xDC00.
95 buffer[0] = char16_t(0xD800 + high_ten_bits);
96 buffer[1] = char16_t(0xDC00 + low_ten_bits);
// UTF-32 branch: cp is stored as a single char32_t.
100 else if constexpr (std::is_same_v<Encoding, lexy::utf32_encoding>)
102 LEXY_PRECONDITION(size >= 1);
104 *buffer = char32_t(cp);
// Fallback for any other Encoding: compilation fails with the message below.
// NOTE(review): lexy::_detail::error<Encoding> is presumably a dependent false
// value so the assertion only fires when this branch is instantiated - confirm.
109 static_assert(lexy::_detail::error<Encoding>,
110 "cannot encode a code point in this encoding");
117 } // namespace lexy::_detail
120 namespace lexy::_detail
// Result record of decoding one code point, parameterized on the Reader type.
// NOTE(review): only the `end` member is visible in this excerpt. The returns in
// parse_code_point build this record from a code point value, a cp_error status and
// reader.current(), so the struct presumably also declares those two members - confirm.
133 template <typename Reader>
// Marker obtained from the reader; recover_code_point passes it to reader.reset().
138 typename Reader::marker end;
// Decodes one code point from `reader` according to Reader::encoding.
// The reader is taken by value, so the caller's reader is not modified here.
// Every visible return builds a cp_result<Reader> from three values: the decoded
// code point (or an empty value), a cp_error status, and reader.current().
// NOTE(review): braces and many body lines (including the statements that advance
// the reader between peeks) are not visible in this excerpt; the comments describe
// only the statements shown.
141 template <typename Reader>
142 constexpr cp_result<Reader> parse_code_point(Reader reader)
// ASCII branch.
144 if constexpr (std::is_same_v<typename Reader::encoding, lexy::ascii_encoding>)
// Nothing to decode when the next unit is the end-of-input value.
146 if (reader.peek() == Reader::encoding::eof())
147 return {{}, cp_error::eof, reader.current()};
// Otherwise the unit is converted to char32_t and reported as either success or
// out_of_range (the condition choosing between the two is not visible here).
149 auto cur = reader.peek();
152 auto cp = static_cast<char32_t>(cur);
154 return {cp, cp_error::success, reader.current()};
156 return {cp, cp_error::out_of_range, reader.current()};
// UTF-8 branch, shared by utf8_encoding and utf8_char_encoding.
//
// payload_lead1..payload_lead4 mask the data bits of the first unit of a sequence
// (7, 5, 4 and 3 bits respectively); payload_cont masks the 6 data bits of a
// trailing unit. pattern_* hold the fixed high bits of each kind of unit.
// A unit u is classified with the test (u & ~payload) == pattern.
//
// Visible flow: the first unit is tested against the lead1, cont, lead2, lead3 and
// lead4 patterns in that order. In the multi-unit cases every following unit is
// tested against pattern_cont, and the result is assembled from the masked payload
// bits of each unit. Additional visible checks:
//   - two units: first equal to 0xC0 or 0xC1 (consequence not visible here);
//   - three units: cp inside 0xD800..0xDFFF, and first == 0xE0 with second < 0xA0
//     (consequences not visible here);
//   - four units: an out_of_range return (condition not visible here), and
//     first == 0xF0 with second < 0x90 reported as overlong_sequence.
// The last visible return of this branch reports cp_error::eof
// (its enclosing condition is not visible here).
158 else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf8_encoding> //
159 || std::is_same_v<typename Reader::encoding, lexy::utf8_char_encoding>)
161 using uchar_t = unsigned char;
162 constexpr auto payload_lead1 = 0b0111'1111;
163 constexpr
auto payload_lead2 = 0b0001
'1111;
164 constexpr auto payload_lead3 = 0b0000'1111;
165 constexpr
auto payload_lead4 = 0b0000
'0111;
166 constexpr auto payload_cont = 0b0011'1111;
168 constexpr
auto pattern_lead1 = 0b0 << 7;
169 constexpr
auto pattern_lead2 = 0b110 << 5;
170 constexpr
auto pattern_lead3 = 0b1110 << 4;
171 constexpr
auto pattern_lead4 = 0b11110 << 3;
172 constexpr
auto pattern_cont = 0b10 << 6;
174 auto first =
uchar_t(reader.peek());
175 if ((first & ~payload_lead1) == pattern_lead1)
181 else if ((first & ~payload_cont) == pattern_cont)
185 else if ((first & ~payload_lead2) == pattern_lead2)
189 auto second =
uchar_t(reader.peek());
190 if ((second & ~payload_cont) != pattern_cont)
194 auto result = char32_t(first & payload_lead2);
196 result |= char32_t(second & payload_cont);
199 if (first == 0xC0 || first == 0xC1)
204 else if ((first & ~payload_lead3) == pattern_lead3)
208 auto second =
uchar_t(reader.peek());
209 if ((second & ~payload_cont) != pattern_cont)
213 auto third =
uchar_t(reader.peek());
214 if ((third & ~payload_cont) != pattern_cont)
218 auto result = char32_t(first & payload_lead3);
220 result |= char32_t(second & payload_cont);
222 result |= char32_t(third & payload_cont);
225 if (0xD800 <= cp && cp <= 0xDFFF)
227 else if (first == 0xE0 && second < 0xA0)
232 else if ((first & ~payload_lead4) == pattern_lead4)
236 auto second =
uchar_t(reader.peek());
237 if ((second & ~payload_cont) != pattern_cont)
241 auto third =
uchar_t(reader.peek());
242 if ((third & ~payload_cont) != pattern_cont)
246 auto fourth =
uchar_t(reader.peek());
247 if ((fourth & ~payload_cont) != pattern_cont)
251 auto result = char32_t(first & payload_lead4);
253 result |= char32_t(second & payload_cont);
255 result |= char32_t(third & payload_cont);
257 result |= char32_t(fourth & payload_cont);
261 return {cp, cp_error::out_of_range, reader.current()};
262 else if (first == 0xF0 && second < 0x90)
263 return {cp, cp_error::overlong_sequence, reader.current()};
265 return {cp, cp_error::success, reader.current()};
269 return {{}, cp_error::eof, reader.current()};
272 else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf16_encoding>)
274 constexpr auto payload1 = 0b0000'0011
'1111'1111;
// The condition above selects the UTF-16 branch. Both units of a pair carry
// ten payload bits, so the second mask reuses the first.
275 constexpr
auto payload2 = payload1;
// pattern1 evaluates to 0xD800 and pattern2 to 0xDC00: the fixed high bits of the
// first and second unit of a two-unit sequence.
277 constexpr
auto pattern1 = 0b110110 << 10;
278 constexpr
auto pattern2 = 0b110111 << 10;
// A first unit matching pattern1 must be followed by a unit matching pattern2;
// the result is then assembled from the masked payload bits of both units
// (the shift and offset steps are not visible here).
283 auto first = char16_t(reader.peek());
284 if ((first & ~payload1) == pattern1)
290 auto second = char16_t(reader.peek());
291 if ((second & ~payload2) != pattern2)
296 auto result = char32_t(first & payload1);
298 result |= char32_t(second & payload2);
// A first unit that matches pattern2 is handled separately
// (the consequence is not visible here).
302 else if ((first & ~payload2) == pattern2)
// UTF-32 branch: the unit is reported as out_of_range (condition not visible
// here), as surrogate when it lies in 0xD800..0xDFFF, and as success otherwise.
313 else if constexpr (std::is_same_v<typename Reader::encoding, lexy::utf32_encoding>)
318 auto cur = reader.peek();
323 return {cp, cp_error::out_of_range, reader.current()};
324 else if (0xD800 <= cp && cp <= 0xDFFF)
325 return {cp, cp_error::surrogate, reader.current()};
327 return {cp, cp_error::success, reader.current()};
// Fallback for any other encoding: compilation fails with the message below.
331 static_assert(lexy::_detail::error<typename Reader::encoding>,
332 "no known code point for this encoding");
// Repositions `reader` after a parse_code_point attempt, based on the status
// stored in `result`. The reader is taken by reference and modified in place.
// NOTE(review): only the success case is fully visible in this excerpt; the labels
// of the remaining cases are not shown.
337 template <typename Reader>
338 constexpr void recover_code_point(Reader& reader, cp_result<Reader> result)
// Dispatch on the decoding status.
340 switch (result.error)
342 case cp_error::success:
343 // Consume the entire code point.
// result.end is the marker recorded when the code point was decoded.
344 reader.reset(result.end);
347 // We don't need to do anything to "recover" from EOF.
361 reader.reset(result.end);
367 #endif // LEXY_DETAIL_CODE_POINT_HPP_INCLUDED