abseil_cpp: charconv_parse.cc Source File

Go to the documentation of this file.
00001 // Copyright 2018 The Abseil Authors.
00002 //
00003 // Licensed under the Apache License, Version 2.0 (the "License");
00004 // you may not use this file except in compliance with the License.
00005 // You may obtain a copy of the License at
00006 //
00007 //      https://www.apache.org/licenses/LICENSE-2.0
00008 //
00009 // Unless required by applicable law or agreed to in writing, software
00010 // distributed under the License is distributed on an "AS IS" BASIS,
00011 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00012 // See the License for the specific language governing permissions and
00013 // limitations under the License.
00014 
00015 #include "absl/strings/internal/charconv_parse.h"
00016 #include "absl/strings/charconv.h"
00017 
00018 #include <cassert>
00019 #include <cstdint>
00020 #include <limits>
00021 
00022 #include "absl/strings/internal/memutil.h"
00023 
00024 namespace absl {
00025 namespace {
00026 
00027 // ParseFloat<10> will read the first 19 significant digits of the mantissa.
00028 // This number was chosen for multiple reasons.
00029 //
00030 // (a) First, for whatever integer type we choose to represent the mantissa, we
00031 // want to choose the largest possible number of decimal digits for that integer
00032 // type.  We are using uint64_t, which can express any 19-digit unsigned
00033 // integer.
00034 //
00035 // (b) Second, we need to parse enough digits that the binary value of any
00036 // mantissa we capture has more bits of resolution than the mantissa
00037 // representation in the target float.  Our algorithm requires at least 3 bits
00038 // of headway, but 19 decimal digits give a little more than that.
00039 //
00040 // The following static assertions verify the above comments:
00041 constexpr int kDecimalMantissaDigitsMax = 19;
00042 
00043 static_assert(std::numeric_limits<uint64_t>::digits10 ==
00044                   kDecimalMantissaDigitsMax,
00045               "(a) above");
00046 
00047 // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
00048 static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
00049 static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
00050 static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
00051 
00052 // The lowest valued 19-digit decimal mantissa we can read still contains
00053 // sufficient information to reconstruct a binary mantissa.
00054 static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
00055 
00056 // ParseFloat<16> will read the first 15 significant digits of the mantissa.
00057 //
00058 // Because a base-16-to-base-2 conversion can be done exactly, we do not need
00059 // to maximize the number of scanned hex digits to improve our conversion.  What
00060 // is required is to scan two more bits than the mantissa can represent, so that
00061 // we always round correctly.
00062 //
00063 // (One extra bit does not suffice to perform correct rounding, since a number
00064 // exactly halfway between two representable floats has unique rounding rules,
00065 // so we need to differentiate between a "halfway between" number and a "closer
00066 // to the larger value" number.)
00067 constexpr int kHexadecimalMantissaDigitsMax = 15;
00068 
00069 // The minimum number of significant bits that will be read from
00070 // kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
00071 // the most significant digit can be a "1", which only contributes a single
00072 // significant bit.
00073 constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
00074     4 * kHexadecimalMantissaDigitsMax - 3;
00075 
00076 static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
00077                   std::numeric_limits<double>::digits + 2,
00078               "kHexadecimalMantissaDigitsMax too small");
00079 
00080 // We also impose a limit on the number of significant digits we will read from
00081 // an exponent, to avoid having to deal with integer overflow.  We use 9 for
00082 // this purpose.
00083 //
00084 // If we read a 9 digit exponent, the end result of the conversion will
00085 // necessarily be infinity or zero, depending on the sign of the exponent.
00086 // Therefore we can just drop extra digits on the floor without any extra
00087 // logic.
00088 constexpr int kDecimalExponentDigitsMax = 9;
00089 static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
00090               "int type too small");
00091 
00092 // To avoid incredibly large inputs causing integer overflow for our exponent,
00093 // we impose an arbitrary but very large limit on the number of significant
00094 // digits we will accept.  The implementation refuses to match a string with
00095 // more consecutive significant mantissa digits than this.
00096 constexpr int kDecimalDigitLimit = 50000000;
00097 
00098 // Corresponding limit for hexadecimal digit inputs.  This is one fourth the
00099 // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
00100 // a binary exponent adjustment of 4.
00101 constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
00102 
00103 // The largest exponent we can read is 999999999 (per
00104 // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
00105 // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
00106 // comfortably fits in an integer.
00107 //
00108 // We count kDecimalDigitLimit twice because there are independent limits for
00109 // numbers before and after the decimal point.  (In the case where there are no
00110 // significant digits before the decimal point, there are independent limits for
00111 // post-decimal-point leading zeroes and for significant digits.)
00112 static_assert(999999999 + 2 * kDecimalDigitLimit <
00113                   std::numeric_limits<int>::max(),
00114               "int type too small");
00115 static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
00116                   std::numeric_limits<int>::max(),
00117               "int type too small");
00118 
00119 // Returns true if the provided bitfield allows parsing an exponent value
00120 // (e.g., "1.5e100").
00121 bool AllowExponent(chars_format flags) {
00122   bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
00123   bool scientific =
00124       (flags & chars_format::scientific) == chars_format::scientific;
00125   return scientific || !fixed;
00126 }
00127 
00128 // Returns true if the provided bitfield requires an exponent value be present.
00129 bool RequireExponent(chars_format flags) {
00130   bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
00131   bool scientific =
00132       (flags & chars_format::scientific) == chars_format::scientific;
00133   return scientific && !fixed;
00134 }
00135 
00136 const int8_t kAsciiToInt[256] = {
00137     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00138     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00139     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
00140     9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
00141     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00142     -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00143     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00144     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00145     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00146     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00147     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00148     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00149     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00150     -1, -1, -1, -1, -1, -1, -1, -1, -1};
00151 
00152 // Returns true if `ch` is a digit in the given base
00153 template <int base>
00154 bool IsDigit(char ch);
00155 
00156 // Converts a valid `ch` to its digit value in the given base.
00157 template <int base>
00158 unsigned ToDigit(char ch);
00159 
00160 // Returns true if `ch` is the exponent delimiter for the given base.
00161 template <int base>
00162 bool IsExponentCharacter(char ch);
00163 
00164 // Returns the maximum number of significant digits we will read for a float
00165 // in the given base.
00166 template <int base>
00167 constexpr int MantissaDigitsMax();
00168 
00169 // Returns the largest consecutive run of digits we will accept when parsing a
00170 // number in the given base.
00171 template <int base>
00172 constexpr int DigitLimit();
00173 
00174 // Returns the amount the exponent must be adjusted by for each dropped digit.
00175 // (For decimal this is 1, since the digits are in base 10 and the exponent base
00176 // is also 10, but for hexadecimal this is 4, since the digits are base 16 but
00177 // the exponent base is 2.)
00178 template <int base>
00179 constexpr int DigitMagnitude();
00180 
00181 template <>
00182 bool IsDigit<10>(char ch) {
00183   return ch >= '0' && ch <= '9';
00184 }
00185 template <>
00186 bool IsDigit<16>(char ch) {
00187   return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
00188 }
00189 
00190 template <>
00191 unsigned ToDigit<10>(char ch) {
00192   return ch - '0';
00193 }
00194 template <>
00195 unsigned ToDigit<16>(char ch) {
00196   return kAsciiToInt[static_cast<unsigned char>(ch)];
00197 }
00198 
00199 template <>
00200 bool IsExponentCharacter<10>(char ch) {
00201   return ch == 'e' || ch == 'E';
00202 }
00203 
00204 template <>
00205 bool IsExponentCharacter<16>(char ch) {
00206   return ch == 'p' || ch == 'P';
00207 }
00208 
00209 template <>
00210 constexpr int MantissaDigitsMax<10>() {
00211   return kDecimalMantissaDigitsMax;
00212 }
00213 template <>
00214 constexpr int MantissaDigitsMax<16>() {
00215   return kHexadecimalMantissaDigitsMax;
00216 }
00217 
00218 template <>
00219 constexpr int DigitLimit<10>() {
00220   return kDecimalDigitLimit;
00221 }
00222 template <>
00223 constexpr int DigitLimit<16>() {
00224   return kHexadecimalDigitLimit;
00225 }
00226 
00227 template <>
00228 constexpr int DigitMagnitude<10>() {
00229   return 1;
00230 }
00231 template <>
00232 constexpr int DigitMagnitude<16>() {
00233   return 4;
00234 }
00235 
00236 // Reads decimal digits from [begin, end) into *out.  Returns the number of
00237 // digits consumed.
00238 //
00239 // After max_digits has been read, keeps consuming characters, but no longer
00240 // adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit
00241 // is set; otherwise, it is left unmodified.
00242 //
00243 // If no digits are matched, returns 0 and leaves *out unchanged.
00244 //
00245 // ConsumeDigits does not protect against overflow on *out; max_digits must
00246 // be chosen with respect to type T to avoid the possibility of overflow.
00247 template <int base, typename T>
00248 std::size_t ConsumeDigits(const char* begin, const char* end, int max_digits,
00249                           T* out, bool* dropped_nonzero_digit) {
00250   if (base == 10) {
00251     assert(max_digits <= std::numeric_limits<T>::digits10);
00252   } else if (base == 16) {
00253     assert(max_digits * 4 <= std::numeric_limits<T>::digits);
00254   }
00255   const char* const original_begin = begin;
00256   T accumulator = *out;
00257   const char* significant_digits_end =
00258       (end - begin > max_digits) ? begin + max_digits : end;
00259   while (begin < significant_digits_end && IsDigit<base>(*begin)) {
00260     // Do not guard against *out overflow; max_digits was chosen to avoid this.
00261     // Do assert against it, to detect problems in debug builds.
00262     auto digit = static_cast<T>(ToDigit<base>(*begin));
00263     assert(accumulator * base >= accumulator);
00264     accumulator *= base;
00265     assert(accumulator + digit >= accumulator);
00266     accumulator += digit;
00267     ++begin;
00268   }
00269   bool dropped_nonzero = false;
00270   while (begin < end && IsDigit<base>(*begin)) {
00271     dropped_nonzero = dropped_nonzero || (*begin != '0');
00272     ++begin;
00273   }
00274   if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
00275     *dropped_nonzero_digit = true;
00276   }
00277   *out = accumulator;
00278   return begin - original_begin;
00279 }
00280 
00281 // Returns true if `v` is one of the chars allowed inside parentheses following
00282 // a NaN.
00283 bool IsNanChar(char v) {
00284   return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
00285          (v >= 'A' && v <= 'Z');
00286 }
00287 
00288 // Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If
00289 // one is found, sets `out` appropriately and returns true.
00290 bool ParseInfinityOrNan(const char* begin, const char* end,
00291                         strings_internal::ParsedFloat* out) {
00292   if (end - begin < 3) {
00293     return false;
00294   }
00295   switch (*begin) {
00296     case 'i':
00297     case 'I': {
00298       // An infinity std::string consists of the characters "inf" or "infinity",
00299       // case insensitive.
00300       if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
00301         return false;
00302       }
00303       out->type = strings_internal::FloatType::kInfinity;
00304       if (end - begin >= 8 &&
00305           strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
00306         out->end = begin + 8;
00307       } else {
00308         out->end = begin + 3;
00309       }
00310       return true;
00311     }
00312     case 'n':
00313     case 'N': {
00314       // A NaN consists of the characters "nan", case insensitive, optionally
00315       // followed by a parenthesized sequence of zero or more alphanumeric
00316       // characters and/or underscores.
00317       if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
00318         return false;
00319       }
00320       out->type = strings_internal::FloatType::kNan;
00321       out->end = begin + 3;
00322       // NaN is allowed to be followed by a parenthesized std::string, consisting of
00323       // only the characters [a-zA-Z0-9_].  Match that if it's present.
00324       begin += 3;
00325       if (begin < end && *begin == '(') {
00326         const char* nan_begin = begin + 1;
00327         while (nan_begin < end && IsNanChar(*nan_begin)) {
00328           ++nan_begin;
00329         }
00330         if (nan_begin < end && *nan_begin == ')') {
00331           // We found an extra NaN specifier range
00332           out->subrange_begin = begin + 1;
00333           out->subrange_end = nan_begin;
00334           out->end = nan_begin + 1;
00335         }
00336       }
00337       return true;
00338     }
00339     default:
00340       return false;
00341   }
00342 }
00343 }  // namespace
00344 
00345 namespace strings_internal {
00346 
00347 template <int base>
00348 strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
00349                                          chars_format format_flags) {
00350   strings_internal::ParsedFloat result;
00351 
00352   // Exit early if we're given an empty range.
00353   if (begin == end) return result;
00354 
00355   // Handle the infinity and NaN cases.
00356   if (ParseInfinityOrNan(begin, end, &result)) {
00357     return result;
00358   }
00359 
00360   const char* const mantissa_begin = begin;
00361   while (begin < end && *begin == '0') {
00362     ++begin;  // skip leading zeros
00363   }
00364   uint64_t mantissa = 0;
00365 
00366   int exponent_adjustment = 0;
00367   bool mantissa_is_inexact = false;
00368   std::size_t pre_decimal_digits = ConsumeDigits<base>(
00369       begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
00370   begin += pre_decimal_digits;
00371   int digits_left;
00372   if (pre_decimal_digits >= DigitLimit<base>()) {
00373     // refuse to parse pathological inputs
00374     return result;
00375   } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
00376     // We dropped some non-fraction digits on the floor.  Adjust our exponent
00377     // to compensate.
00378     exponent_adjustment =
00379         static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
00380     digits_left = 0;
00381   } else {
00382     digits_left =
00383         static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
00384   }
00385   if (begin < end && *begin == '.') {
00386     ++begin;
00387     if (mantissa == 0) {
00388       // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
00389       // have to adjust the exponent to reflect the changed place value.
00390       const char* begin_zeros = begin;
00391       while (begin < end && *begin == '0') {
00392         ++begin;
00393       }
00394       std::size_t zeros_skipped = begin - begin_zeros;
00395       if (zeros_skipped >= DigitLimit<base>()) {
00396         // refuse to parse pathological inputs
00397         return result;
00398       }
00399       exponent_adjustment -= static_cast<int>(zeros_skipped);
00400     }
00401     std::size_t post_decimal_digits = ConsumeDigits<base>(
00402         begin, end, digits_left, &mantissa, &mantissa_is_inexact);
00403     begin += post_decimal_digits;
00404 
00405     // Since `mantissa` is an integer, each significant digit we read after
00406     // the decimal point requires an adjustment to the exponent. "1.23e0" will
00407     // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
00408     // "123e-2").
00409     if (post_decimal_digits >= DigitLimit<base>()) {
00410       // refuse to parse pathological inputs
00411       return result;
00412     } else if (post_decimal_digits > digits_left) {
00413       exponent_adjustment -= digits_left;
00414     } else {
00415       exponent_adjustment -= post_decimal_digits;
00416     }
00417   }
00418   // If we've found no mantissa whatsoever, this isn't a number.
00419   if (mantissa_begin == begin) {
00420     return result;
00421   }
00422   // A bare "." doesn't count as a mantissa either.
00423   if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
00424     return result;
00425   }
00426 
00427   if (mantissa_is_inexact) {
00428     // We dropped significant digits on the floor.  Handle this appropriately.
00429     if (base == 10) {
00430       // If we truncated significant decimal digits, store the full range of the
00431       // mantissa for future big integer math for exact rounding.
00432       result.subrange_begin = mantissa_begin;
00433       result.subrange_end = begin;
00434     } else if (base == 16) {
00435       // If we truncated hex digits, reflect this fact by setting the low
00436       // ("sticky") bit.  This allows for correct rounding in all cases.
00437       mantissa |= 1;
00438     }
00439   }
00440   result.mantissa = mantissa;
00441 
00442   const char* const exponent_begin = begin;
00443   result.literal_exponent = 0;
00444   bool found_exponent = false;
00445   if (AllowExponent(format_flags) && begin < end &&
00446       IsExponentCharacter<base>(*begin)) {
00447     bool negative_exponent = false;
00448     ++begin;
00449     if (begin < end && *begin == '-') {
00450       negative_exponent = true;
00451       ++begin;
00452     } else if (begin < end && *begin == '+') {
00453       ++begin;
00454     }
00455     const char* const exponent_digits_begin = begin;
00456     // Exponent is always expressed in decimal, even for hexadecimal floats.
00457     begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
00458                                &result.literal_exponent, nullptr);
00459     if (begin == exponent_digits_begin) {
00460       // there were no digits where we expected an exponent.  We failed to read
00461       // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
00462       found_exponent = false;
00463       begin = exponent_begin;
00464     } else {
00465       found_exponent = true;
00466       if (negative_exponent) {
00467         result.literal_exponent = -result.literal_exponent;
00468       }
00469     }
00470   }
00471 
00472   if (!found_exponent && RequireExponent(format_flags)) {
00473     // Provided flags required an exponent, but none was found.  This results
00474     // in a failure to scan.
00475     return result;
00476   }
00477 
00478   // Success!
00479   result.type = strings_internal::FloatType::kNumber;
00480   if (result.mantissa > 0) {
00481     result.exponent = result.literal_exponent +
00482                       (DigitMagnitude<base>() * exponent_adjustment);
00483   } else {
00484     result.exponent = 0;
00485   }
00486   result.end = begin;
00487   return result;
00488 }
00489 
00490 template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
00491                                     chars_format format_flags);
00492 template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
00493                                     chars_format format_flags);
00494 
00495 }  // namespace strings_internal
00496 }  // namespace absl