91 #include <google/protobuf/io/tokenizer.h> 
   93 #include <google/protobuf/stubs/common.h> 
   94 #include <google/protobuf/stubs/logging.h> 
   95 #include <google/protobuf/stubs/stringprintf.h> 
   96 #include <google/protobuf/stubs/strutil.h> 
   97 #include <google/protobuf/io/strtod.h> 
   98 #include <google/protobuf/io/zero_copy_stream.h> 
   99 #include <google/protobuf/stubs/stl_util.h> 
  117 #define CHARACTER_CLASS(NAME, EXPRESSION)                     \ 
  120     static inline bool InClass(char c) { return EXPRESSION; } \ 
  123 CHARACTER_CLASS(Whitespace, c == 
' ' || c == 
'\n' || c == 
'\t' || c == 
'\r' ||
 
  124                                 c == 
'\v' || c == 
'\f');
 
  126                 c == 
' ' || c == 
'\t' || c == 
'\r' || c == 
'\v' || c == 
'\f');
 
  132 CHARACTER_CLASS(HexDigit, (
'0' <= c && c <= 
'9') || (
'a' <= c && c <= 
'f') ||
 
  133                               (
'A' <= c && c <= 
'F'));
 
  136                 (
'a' <= c && c <= 
'z') || (
'A' <= c && c <= 
'Z') || (c == 
'_'));
 
  139                                   (
'A' <= c && c <= 
'Z') ||
 
  140                                   (
'0' <= c && c <= 
'9') || (c == 
'_'));
 
  142 CHARACTER_CLASS(Escape, c == 
'a' || c == 
'b' || c == 
'f' || c == 
'n' ||
 
  143                             c == 
'r' || c == 
't' || c == 
'v' || c == 
'\\' ||
 
  144                             c == 
'?' || c == 
'\'' || c == 
'\"');
 
  146 #undef CHARACTER_CLASS 
  150 inline int DigitValue(
char digit) {
 
  151   if (
'0' <= digit && digit <= 
'9') 
return digit - 
'0';
 
  152   if (
'a' <= digit && digit <= 
'z') 
return digit - 
'a' + 10;
 
  153   if (
'A' <= digit && digit <= 
'Z') 
return digit - 
'A' + 10;
 
  158 inline char TranslateEscape(
char c) {
 
  196                      ErrorCollector* error_collector)
 
  205       record_target_(NULL),
 
  207       allow_f_after_float_(
false),
 
  208       comment_style_(CPP_COMMENT_STYLE),
 
  209       require_space_after_number_(
true),
 
  210       allow_multiline_strings_(
false) {
 
  219 Tokenizer::~Tokenizer() {
 
  222   if (buffer_size_ > buffer_pos_) {
 
  223     input_->BackUp(buffer_size_ - buffer_pos_);
 
  227 bool Tokenizer::report_whitespace()
 const { 
return report_whitespace_; }
 
  229 void Tokenizer::set_report_whitespace(
bool report) {
 
  230   report_whitespace_ = report;
 
  231   report_newlines_ &= report;
 
  235 bool Tokenizer::report_newlines()
 const { 
return report_newlines_; }
 
  237 void Tokenizer::set_report_newlines(
bool report) {
 
  238   report_newlines_ = report;
 
  239   report_whitespace_ |= report;  
 
  245 void Tokenizer::NextChar() {
 
  248   if (current_char_ == 
'\n') {
 
  251   } 
else if (current_char_ == 
'\t') {
 
  252     column_ += kTabWidth - column_ % kTabWidth;
 
  259   if (buffer_pos_ < buffer_size_) {
 
  260     current_char_ = 
buffer_[buffer_pos_];
 
  266 void Tokenizer::Refresh() {
 
  268     current_char_ = 
'\0';
 
  273   if (record_target_ != NULL && record_start_ < buffer_size_) {
 
  274     record_target_->append(
buffer_ + record_start_,
 
  275                            buffer_size_ - record_start_);
 
  279   const void* 
data = NULL;
 
  287       current_char_ = 
'\0';
 
  290   } 
while (buffer_size_ == 0);
 
  299   record_start_ = buffer_pos_;
 
  302 inline void Tokenizer::StopRecording() {
 
  307   if (buffer_pos_ != record_start_) {
 
  308     record_target_->append(
buffer_ + record_start_,
 
  309                            buffer_pos_ - record_start_);
 
  311   record_target_ = NULL;
 
  315 inline void Tokenizer::StartToken() {
 
  323 inline void Tokenizer::EndToken() {
 
  331 template <
typename CharacterClass>
 
  332 inline bool Tokenizer::LookingAt() {
 
  333   return CharacterClass::InClass(current_char_);
 
  336 template <
typename CharacterClass>
 
  337 inline bool Tokenizer::TryConsumeOne() {
 
  338   if (CharacterClass::InClass(current_char_)) {
 
  346 inline bool Tokenizer::TryConsume(
char c) {
 
  347   if (current_char_ == 
c) {
 
  355 template <
typename CharacterClass>
 
  356 inline void Tokenizer::ConsumeZeroOrMore() {
 
  357   while (CharacterClass::InClass(current_char_)) {
 
  362 template <
typename CharacterClass>
 
  363 inline void Tokenizer::ConsumeOneOrMore(
const char* 
error) {
 
  364   if (!CharacterClass::InClass(current_char_)) {
 
  369     } 
while (CharacterClass::InClass(current_char_));
 
  377 void Tokenizer::ConsumeString(
char delimiter) {
 
  379     switch (current_char_) {
 
  381         AddError(
"Unexpected end of string.");
 
  385         if (!allow_multiline_strings_) {
 
  386           AddError(
"String literals cannot cross line boundaries.");
 
  396         if (TryConsumeOne<Escape>()) {
 
  398         } 
else if (TryConsumeOne<OctalDigit>()) {
 
  402         } 
else if (TryConsume(
'x')) {
 
  403           if (!TryConsumeOne<HexDigit>()) {
 
  404             AddError(
"Expected hex digits for escape sequence.");
 
  407         } 
else if (TryConsume(
'u')) {
 
  408           if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
 
  409               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
 
  410             AddError(
"Expected four hex digits for \\u escape sequence.");
 
  412         } 
else if (TryConsume(
'U')) {
 
  415           if (!TryConsume(
'0') || !TryConsume(
'0') ||
 
  416               !(TryConsume(
'0') || TryConsume(
'1')) ||
 
  417               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
 
  418               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
 
  419               !TryConsumeOne<HexDigit>()) {
 
  421                 "Expected eight hex digits up to 10ffff for \\U escape " 
  425           AddError(
"Invalid escape sequence in string literal.");
 
  431         if (current_char_ == delimiter) {
 
  442 Tokenizer::TokenType Tokenizer::ConsumeNumber(
bool started_with_zero,
 
  443                                               bool started_with_dot) {
 
  444   bool is_float = 
false;
 
  446   if (started_with_zero && (TryConsume(
'x') || TryConsume(
'X'))) {
 
  448     ConsumeOneOrMore<HexDigit>(
"\"0x\" must be followed by hex digits.");
 
  450   } 
else if (started_with_zero && LookingAt<Digit>()) {
 
  452     ConsumeZeroOrMore<OctalDigit>();
 
  453     if (LookingAt<Digit>()) {
 
  454       AddError(
"Numbers starting with leading zero must be in octal.");
 
  455       ConsumeZeroOrMore<Digit>();
 
  460     if (started_with_dot) {
 
  462       ConsumeZeroOrMore<Digit>();
 
  464       ConsumeZeroOrMore<Digit>();
 
  466       if (TryConsume(
'.')) {
 
  468         ConsumeZeroOrMore<Digit>();
 
  472     if (TryConsume(
'e') || TryConsume(
'E')) {
 
  474       TryConsume(
'-') || TryConsume(
'+');
 
  475       ConsumeOneOrMore<Digit>(
"\"e\" must be followed by exponent.");
 
  478     if (allow_f_after_float_ && (TryConsume(
'f') || TryConsume(
'F'))) {
 
  483   if (LookingAt<Letter>() && require_space_after_number_) {
 
  484     AddError(
"Need space between number and identifier.");
 
  485   } 
else if (current_char_ == 
'.') {
 
  488           "Already saw decimal point or exponent; can't have another one.");
 
  490       AddError(
"Hex and octal numbers must be integers.");
 
  494   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
 
  500   while (current_char_ != 
'\0' && current_char_ != 
'\n') {
 
  505   if (
content != NULL) StopRecording();
 
  509   int start_line = 
line_;
 
  510   int start_column = column_ - 2;
 
  515     while (current_char_ != 
'\0' && current_char_ != 
'*' &&
 
  516            current_char_ != 
'/' && current_char_ != 
'\n') {
 
  520     if (TryConsume(
'\n')) {
 
  521       if (
content != NULL) StopRecording();
 
  524       ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  525       if (TryConsume(
'*')) {
 
  526         if (TryConsume(
'/')) {
 
  533     } 
else if (TryConsume(
'*') && TryConsume(
'/')) {
 
  541     } 
else if (TryConsume(
'/') && current_char_ == 
'*') {
 
  545           "\"/*\" inside block comment.  Block comments cannot be nested.");
 
  546     } 
else if (current_char_ == 
'\0') {
 
  547       AddError(
"End-of-file inside block comment.");
 
  549                                  "  Comment started here.");
 
  550       if (
content != NULL) StopRecording();
 
  556 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
 
  557   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume(
'/')) {
 
  558     if (TryConsume(
'/')) {
 
  560     } 
else if (TryConsume(
'*')) {
 
  561       return BLOCK_COMMENT;
 
  569       return SLASH_NOT_COMMENT;
 
  571   } 
else if (comment_style_ == SH_COMMENT_STYLE && TryConsume(
'#')) {
 
  578 bool Tokenizer::TryConsumeWhitespace() {
 
  579   if (report_newlines_) {
 
  580     if (TryConsumeOne<WhitespaceNoNewline>()) {
 
  581       ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  587   if (TryConsumeOne<Whitespace>()) {
 
  588     ConsumeZeroOrMore<Whitespace>();
 
  590     return report_whitespace_;
 
  595 bool Tokenizer::TryConsumeNewline() {
 
  596   if (!report_whitespace_ || !report_newlines_) {
 
  599   if (TryConsume(
'\n')) {
 
  611   while (!read_error_) {
 
  613     bool report_token = TryConsumeWhitespace() || TryConsumeNewline();
 
  619     switch (TryConsumeCommentStart()) {
 
  621         ConsumeLineComment(NULL);
 
  624         ConsumeBlockComment(NULL);
 
  626       case SLASH_NOT_COMMENT:
 
  633     if (read_error_) 
break;
 
  635     if (LookingAt<Unprintable>() || current_char_ == 
'\0') {
 
  636       AddError(
"Invalid control characters encountered in text.");
 
  643       while (TryConsumeOne<Unprintable>() ||
 
  644              (!read_error_ && TryConsume(
'\0'))) {
 
  652       if (TryConsumeOne<Letter>()) {
 
  653         ConsumeZeroOrMore<Alphanumeric>();
 
  655       } 
else if (TryConsume(
'0')) {
 
  656         current_.type = ConsumeNumber(
true, 
false);
 
  657       } 
else if (TryConsume(
'.')) {
 
  661         if (TryConsumeOne<Digit>()) {
 
  663           if (previous_.type == TYPE_IDENTIFIER &&
 
  665               current_.column == previous_.end_column) {
 
  669                 "Need space between identifier and decimal point.");
 
  671           current_.type = ConsumeNumber(
false, 
true);
 
  675       } 
else if (TryConsumeOne<Digit>()) {
 
  676         current_.type = ConsumeNumber(
false, 
false);
 
  677       } 
else if (TryConsume(
'\"')) {
 
  680       } 
else if (TryConsume(
'\'')) {
 
  685         if (current_char_ & 0x80) {
 
  689                            static_cast<unsigned char>(current_char_)));
 
  718 class CommentCollector {
 
  734   ~CommentCollector() {
 
  819     if (TryConsume(
static_cast<char>(0xEF))) {
 
  820       if (!TryConsume(
static_cast<char>(0xBB)) ||
 
  821           !TryConsume(
static_cast<char>(0xBF))) {
 
  823             "Proto file starts with 0xEF but not UTF-8 BOM. " 
  824             "Only UTF-8 is accepted for proto file.");
 
  828     collector.DetachFromPrev();
 
  832     ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  833     switch (TryConsumeCommentStart()) {
 
  835         ConsumeLineComment(collector.GetBufferForLineComment());
 
  842         ConsumeBlockComment(collector.GetBufferForBlockComment());
 
  844         ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  845         if (!TryConsume(
'\n')) {
 
  848           collector.ClearBuffer();
 
  856       case SLASH_NOT_COMMENT:
 
  859         if (!TryConsume(
'\n')) {
 
  869     ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  871     switch (TryConsumeCommentStart()) {
 
  873         ConsumeLineComment(collector.GetBufferForLineComment());
 
  876         ConsumeBlockComment(collector.GetBufferForBlockComment());
 
  880         ConsumeZeroOrMore<WhitespaceNoNewline>();
 
  883       case SLASH_NOT_COMMENT:
 
  886         if (TryConsume(
'\n')) {
 
  889           collector.DetachFromPrev();
 
  919   const char* 
ptr = 
text.c_str();
 
  922     if (
ptr[1] == 
'x' || 
ptr[1] == 
'X') {
 
  933   for (; *
ptr != 
'\0'; 
ptr++) {
 
  934     int digit = DigitValue(*
ptr);
 
  935     if (digit < 0 || digit >= 
base) {
 
  940     if (
static_cast<uint64_t>(digit) > max_value ||
 
  960   if (*
end == 
'e' || *
end == 
'E') {
 
  967   if (*
end == 
'f' || *
end == 
'F') {
 
  973       << 
" Tokenizer::ParseFloat() passed text that could not have been" 
  974          " tokenized as a float: " 
  984   if (code_point <= 0x7f) {
 
  987   } 
else if (code_point <= 0x07ff) {
 
  988     tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
 
  990   } 
else if (code_point <= 0xffff) {
 
  991     tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
 
  992           ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
 
  994   } 
else if (code_point <= 0x10ffff) {
 
  995     tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
 
  996           ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
 
  997           (code_point & 0x003f);
 
 1014   if (
len == 0) 
return false;
 
 1016     if (*
ptr == 
'\0') 
return false;
 
 1052   if (
key == 
'u') 
return 4;
 
 1053   if (
key == 
'U') 
return 8;
 
 1062   const char* 
p = 
ptr;
 
 1092   const size_t text_size = 
text.size();
 
 1093   if (text_size == 0) {
 
 1094     GOOGLE_LOG(DFATAL) << 
" Tokenizer::ParseStringAppend() passed text that could not" 
 1095                    " have been tokenized as a string: " 
 1103   const size_t new_len = text_size + 
output->size();
 
 1104   if (new_len > 
output->capacity()) {
 
 1105     output->reserve(new_len);
 
 1112   for (
const char* 
ptr = 
text.c_str() + 1; *
ptr != 
'\0'; 
ptr++) {
 
 1113     if (*
ptr == 
'\\' && 
ptr[1] != 
'\0') {
 
 1117       if (OctalDigit::InClass(*
ptr)) {
 
 1120         if (OctalDigit::InClass(
ptr[1])) {
 
 1124         if (OctalDigit::InClass(
ptr[1])) {
 
 1130       } 
else if (*
ptr == 
'x') {
 
 1134         if (HexDigit::InClass(
ptr[1])) {
 
 1138         if (HexDigit::InClass(
ptr[1])) {
 
 1144       } 
else if (*
ptr == 
'u' || *
ptr == 
'U') {
 
 1156         output->push_back(TranslateEscape(*
ptr));
 
 1159     } 
else if (*
ptr == 
text[0] && 
ptr[1] == 
'\0') {
 
 1167 template <
typename CharacterClass>
 
 1169   for (
const char character : s) {
 
 1170     if (!CharacterClass::InClass(character)) 
return false;
 
 1177   if (
text.size() == 0) 
return false;
 
 1178   if (!Letter::InClass(
text.at(0))) 
return false;
 
 1179   if (!AllInClass<Alphanumeric>(
text.substr(1))) 
return false;