charconv_parse.cc
Go to the documentation of this file.
1 // Copyright 2018 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
16 #include "absl/strings/charconv.h"
17 
18 #include <cassert>
19 #include <cstdint>
20 #include <limits>
21 
23 
24 namespace absl {
25 namespace {
26 
27 // ParseFloat<10> will read the first 19 significant digits of the mantissa.
28 // This number was chosen for multiple reasons.
29 //
30 // (a) First, for whatever integer type we choose to represent the mantissa, we
31 // want to choose the largest possible number of decimal digits for that integer
32 // type. We are using uint64_t, which can express any 19-digit unsigned
33 // integer.
34 //
35 // (b) Second, we need to parse enough digits that the binary value of any
36 // mantissa we capture has more bits of resolution than the mantissa
37 // representation in the target float. Our algorithm requires at least 3 bits
38 // of headway, but 19 decimal digits give a little more than that.
39 //
40 // The following static assertions verify the above comments:
41 constexpr int kDecimalMantissaDigitsMax = 19;
42 
43 static_assert(std::numeric_limits<uint64_t>::digits10 ==
44  kDecimalMantissaDigitsMax,
45  "(a) above");
46 
47 // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
48 static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
49 static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
50 static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
51 
52 // The lowest valued 19-digit decimal mantissa we can read still contains
53 // sufficient information to reconstruct a binary mantissa.
54 static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
55 
56 // ParseFloat<16> will read the first 15 significant digits of the mantissa.
57 //
58 // Because a base-16-to-base-2 conversion can be done exactly, we do not need
59 // to maximize the number of scanned hex digits to improve our conversion. What
60 // is required is to scan two more bits than the mantissa can represent, so that
61 // we always round correctly.
62 //
63 // (One extra bit does not suffice to perform correct rounding, since a number
64 // exactly halfway between two representable floats has unique rounding rules,
65 // so we need to differentiate between a "halfway between" number and a "closer
66 // to the larger value" number.)
67 constexpr int kHexadecimalMantissaDigitsMax = 15;
68 
69 // The minimum number of significant bits that will be read from
70 // kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since
71 // the most significant digit can be a "1", which only contributes a single
72 // significant bit.
73 constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
74  4 * kHexadecimalMantissaDigitsMax - 3;
75 
76 static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
77  std::numeric_limits<double>::digits + 2,
78  "kHexadecimalMantissaDigitsMax too small");
79 
80 // We also impose a limit on the number of significant digits we will read from
81 // an exponent, to avoid having to deal with integer overflow. We use 9 for
82 // this purpose.
83 //
84 // If we read a 9 digit exponent, the end result of the conversion will
85 // necessarily be infinity or zero, depending on the sign of the exponent.
86 // Therefore we can just drop extra digits on the floor without any extra
87 // logic.
88 constexpr int kDecimalExponentDigitsMax = 9;
89 static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
90  "int type too small");
91 
92 // To avoid incredibly large inputs causing integer overflow for our exponent,
93 // we impose an arbitrary but very large limit on the number of significant
94 // digits we will accept. The implementation refuses to match a string with
95 // more consecutive significant mantissa digits than this.
96 constexpr int kDecimalDigitLimit = 50000000;
97 
98 // Corresponding limit for hexadecimal digit inputs. This is one fourth the
99 // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
100 // a binary exponent adjustment of 4.
101 constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
102 
103 // The largest exponent we can read is 999999999 (per
104 // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
105 // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
106 // comfortably fits in an integer.
107 //
108 // We count kDecimalDigitLimit twice because there are independent limits for
109 // numbers before and after the decimal point. (In the case where there are no
110 // significant digits before the decimal point, there are independent limits for
111 // post-decimal-point leading zeroes and for significant digits.)
112 static_assert(999999999 + 2 * kDecimalDigitLimit <
113  std::numeric_limits<int>::max(),
114  "int type too small");
115 static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
116  std::numeric_limits<int>::max(),
117  "int type too small");
118 
119 // Returns true if the provided bitfield allows parsing an exponent value
120 // (e.g., "1.5e100").
121 bool AllowExponent(chars_format flags) {
122  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
123  bool scientific =
125  return scientific || !fixed;
126 }
127 
128 // Returns true if the provided bitfield requires an exponent value be present.
129 bool RequireExponent(chars_format flags) {
130  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
131  bool scientific =
133  return scientific && !fixed;
134 }
135 
136 const int8_t kAsciiToInt[256] = {
137  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
139  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
140  9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
141  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
142  -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
143  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
144  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
145  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
146  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
147  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
148  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
149  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
150  -1, -1, -1, -1, -1, -1, -1, -1, -1};
151 
152 // Returns true if `ch` is a digit in the given base
153 template <int base>
154 bool IsDigit(char ch);
155 
156 // Converts a valid `ch` to its digit value in the given base.
157 template <int base>
158 unsigned ToDigit(char ch);
159 
160 // Returns true if `ch` is the exponent delimiter for the given base.
161 template <int base>
162 bool IsExponentCharacter(char ch);
163 
164 // Returns the maximum number of significant digits we will read for a float
165 // in the given base.
166 template <int base>
167 constexpr int MantissaDigitsMax();
168 
169 // Returns the largest consecutive run of digits we will accept when parsing a
170 // number in the given base.
171 template <int base>
172 constexpr int DigitLimit();
173 
174 // Returns the amount the exponent must be adjusted by for each dropped digit.
175 // (For decimal this is 1, since the digits are in base 10 and the exponent base
176 // is also 10, but for hexadecimal this is 4, since the digits are base 16 but
177 // the exponent base is 2.)
178 template <int base>
179 constexpr int DigitMagnitude();
180 
181 template <>
182 bool IsDigit<10>(char ch) {
183  return ch >= '0' && ch <= '9';
184 }
185 template <>
186 bool IsDigit<16>(char ch) {
187  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
188 }
189 
190 template <>
191 unsigned ToDigit<10>(char ch) {
192  return ch - '0';
193 }
194 template <>
195 unsigned ToDigit<16>(char ch) {
196  return kAsciiToInt[static_cast<unsigned char>(ch)];
197 }
198 
199 template <>
200 bool IsExponentCharacter<10>(char ch) {
201  return ch == 'e' || ch == 'E';
202 }
203 
204 template <>
205 bool IsExponentCharacter<16>(char ch) {
206  return ch == 'p' || ch == 'P';
207 }
208 
209 template <>
210 constexpr int MantissaDigitsMax<10>() {
211  return kDecimalMantissaDigitsMax;
212 }
213 template <>
214 constexpr int MantissaDigitsMax<16>() {
215  return kHexadecimalMantissaDigitsMax;
216 }
217 
218 template <>
219 constexpr int DigitLimit<10>() {
220  return kDecimalDigitLimit;
221 }
222 template <>
223 constexpr int DigitLimit<16>() {
224  return kHexadecimalDigitLimit;
225 }
226 
227 template <>
228 constexpr int DigitMagnitude<10>() {
229  return 1;
230 }
231 template <>
232 constexpr int DigitMagnitude<16>() {
233  return 4;
234 }
235 
236 // Reads decimal digits from [begin, end) into *out. Returns the number of
237 // digits consumed.
238 //
239 // After max_digits has been read, keeps consuming characters, but no longer
240 // adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit
241 // is set; otherwise, it is left unmodified.
242 //
243 // If no digits are matched, returns 0 and leaves *out unchanged.
244 //
245 // ConsumeDigits does not protect against overflow on *out; max_digits must
246 // be chosen with respect to type T to avoid the possibility of overflow.
247 template <int base, typename T>
248 std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits,
249  T* out, bool* dropped_nonzero_digit) {
250  if (base == 10) {
251  assert(max_digits <= std::numeric_limits<T>::digits10);
252  } else if (base == 16) {
253  assert(max_digits * 4 <= std::numeric_limits<T>::digits);
254  }
255  const char* const original_begin = begin;
256  T accumulator = *out;
257  const char* significant_digits_end =
258  (end - begin > max_digits) ? begin + max_digits : end;
259  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
260  // Do not guard against *out overflow; max_digits was chosen to avoid this.
261  // Do assert against it, to detect problems in debug builds.
262  auto digit = static_cast<T>(ToDigit<base>(*begin));
263  assert(accumulator * base >= accumulator);
264  accumulator *= base;
265  assert(accumulator + digit >= accumulator);
266  accumulator += digit;
267  ++begin;
268  }
269  bool dropped_nonzero = false;
270  while (begin < end && IsDigit<base>(*begin)) {
271  dropped_nonzero = dropped_nonzero || (*begin != '0');
272  ++begin;
273  }
274  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
275  *dropped_nonzero_digit = true;
276  }
277  *out = accumulator;
278  return begin - original_begin;
279 }
280 
281 // Returns true if `v` is one of the chars allowed inside parentheses following
282 // a NaN.
283 bool IsNanChar(char v) {
284  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
285  (v >= 'A' && v <= 'Z');
286 }
287 
288 // Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If
289 // one is found, sets `out` appropriately and returns true.
290 bool ParseInfinityOrNan(const char* begin, const char* end,
291  strings_internal::ParsedFloat* out) {
292  if (end - begin < 3) {
293  return false;
294  }
295  switch (*begin) {
296  case 'i':
297  case 'I': {
298  // An infinity std::string consists of the characters "inf" or "infinity",
299  // case insensitive.
300  if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
301  return false;
302  }
304  if (end - begin >= 8 &&
305  strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
306  out->end = begin + 8;
307  } else {
308  out->end = begin + 3;
309  }
310  return true;
311  }
312  case 'n':
313  case 'N': {
314  // A NaN consists of the characters "nan", case insensitive, optionally
315  // followed by a parenthesized sequence of zero or more alphanumeric
316  // characters and/or underscores.
317  if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
318  return false;
319  }
321  out->end = begin + 3;
322  // NaN is allowed to be followed by a parenthesized std::string, consisting of
323  // only the characters [a-zA-Z0-9_]. Match that if it's present.
324  begin += 3;
325  if (begin < end && *begin == '(') {
326  const char* nan_begin = begin + 1;
327  while (nan_begin < end && IsNanChar(*nan_begin)) {
328  ++nan_begin;
329  }
330  if (nan_begin < end && *nan_begin == ')') {
331  // We found an extra NaN specifier range
332  out->subrange_begin = begin + 1;
333  out->subrange_end = nan_begin;
334  out->end = nan_begin + 1;
335  }
336  }
337  return true;
338  }
339  default:
340  return false;
341  }
342 }
343 } // namespace
344 
345 namespace strings_internal {
346 
347 template <int base>
348 strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
349  chars_format format_flags) {
351 
352  // Exit early if we're given an empty range.
353  if (begin == end) return result;
354 
355  // Handle the infinity and NaN cases.
356  if (ParseInfinityOrNan(begin, end, &result)) {
357  return result;
358  }
359 
360  const char* const mantissa_begin = begin;
361  while (begin < end && *begin == '0') {
362  ++begin; // skip leading zeros
363  }
364  uint64_t mantissa = 0;
365 
366  int exponent_adjustment = 0;
367  bool mantissa_is_inexact = false;
368  std::size_t pre_decimal_digits = ConsumeDigits<base>(
369  begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
370  begin += pre_decimal_digits;
371  int digits_left;
372  if (pre_decimal_digits >= DigitLimit<base>()) {
373  // refuse to parse pathological inputs
374  return result;
375  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
376  // We dropped some non-fraction digits on the floor. Adjust our exponent
377  // to compensate.
378  exponent_adjustment =
379  static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
380  digits_left = 0;
381  } else {
382  digits_left =
383  static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
384  }
385  if (begin < end && *begin == '.') {
386  ++begin;
387  if (mantissa == 0) {
388  // If we haven't seen any nonzero digits yet, keep skipping zeros. We
389  // have to adjust the exponent to reflect the changed place value.
390  const char* begin_zeros = begin;
391  while (begin < end && *begin == '0') {
392  ++begin;
393  }
394  std::size_t zeros_skipped = begin - begin_zeros;
395  if (zeros_skipped >= DigitLimit<base>()) {
396  // refuse to parse pathological inputs
397  return result;
398  }
399  exponent_adjustment -= static_cast<int>(zeros_skipped);
400  }
401  std::size_t post_decimal_digits = ConsumeDigits<base>(
402  begin, end, digits_left, &mantissa, &mantissa_is_inexact);
403  begin += post_decimal_digits;
404 
405  // Since `mantissa` is an integer, each significant digit we read after
406  // the decimal point requires an adjustment to the exponent. "1.23e0" will
407  // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
408  // "123e-2").
409  if (post_decimal_digits >= DigitLimit<base>()) {
410  // refuse to parse pathological inputs
411  return result;
412  } else if (post_decimal_digits > digits_left) {
413  exponent_adjustment -= digits_left;
414  } else {
415  exponent_adjustment -= post_decimal_digits;
416  }
417  }
418  // If we've found no mantissa whatsoever, this isn't a number.
419  if (mantissa_begin == begin) {
420  return result;
421  }
422  // A bare "." doesn't count as a mantissa either.
423  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
424  return result;
425  }
426 
427  if (mantissa_is_inexact) {
428  // We dropped significant digits on the floor. Handle this appropriately.
429  if (base == 10) {
430  // If we truncated significant decimal digits, store the full range of the
431  // mantissa for future big integer math for exact rounding.
432  result.subrange_begin = mantissa_begin;
433  result.subrange_end = begin;
434  } else if (base == 16) {
435  // If we truncated hex digits, reflect this fact by setting the low
436  // ("sticky") bit. This allows for correct rounding in all cases.
437  mantissa |= 1;
438  }
439  }
440  result.mantissa = mantissa;
441 
442  const char* const exponent_begin = begin;
443  result.literal_exponent = 0;
444  bool found_exponent = false;
445  if (AllowExponent(format_flags) && begin < end &&
446  IsExponentCharacter<base>(*begin)) {
447  bool negative_exponent = false;
448  ++begin;
449  if (begin < end && *begin == '-') {
450  negative_exponent = true;
451  ++begin;
452  } else if (begin < end && *begin == '+') {
453  ++begin;
454  }
455  const char* const exponent_digits_begin = begin;
456  // Exponent is always expressed in decimal, even for hexadecimal floats.
457  begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
458  &result.literal_exponent, nullptr);
459  if (begin == exponent_digits_begin) {
460  // there were no digits where we expected an exponent. We failed to read
461  // an exponent and should not consume the 'e' after all. Rewind 'begin'.
462  found_exponent = false;
463  begin = exponent_begin;
464  } else {
465  found_exponent = true;
466  if (negative_exponent) {
467  result.literal_exponent = -result.literal_exponent;
468  }
469  }
470  }
471 
472  if (!found_exponent && RequireExponent(format_flags)) {
473  // Provided flags required an exponent, but none was found. This results
474  // in a failure to scan.
475  return result;
476  }
477 
478  // Success!
480  if (result.mantissa > 0) {
481  result.exponent = result.literal_exponent +
482  (DigitMagnitude<base>() * exponent_adjustment);
483  } else {
484  result.exponent = 0;
485  }
486  result.end = begin;
487  return result;
488 }
489 
490 template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
491  chars_format format_flags);
492 template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
493  chars_format format_flags);
494 
495 } // namespace strings_internal
496 } // namespace absl
int v
Definition: variant_test.cc:81
char * begin
uint64_t mantissa
Definition: charconv.cc:238
char * end
template ParsedFloat ParseFloat< 10 >(const char *begin, const char *end, chars_format format_flags)
Definition: algorithm.h:29
template ParsedFloat ParseFloat< 16 >(const char *begin, const char *end, chars_format format_flags)
static bool IsDigit(char c)
Definition: demangle.cc:369
strings_internal::ParsedFloat ParseFloat(const char *begin, const char *end, chars_format format_flags)
int radix
char * out
Definition: mutex.h:1013
int memcasecmp(const char *s1, const char *s2, size_t len)
Definition: memutil.cc:22
chars_format
Definition: charconv.h:26


abseil_cpp
Author(s):
autogenerated on Mon Feb 28 2022 21:31:17