91 #include <google/protobuf/io/tokenizer.h>
93 #include <google/protobuf/stubs/common.h>
94 #include <google/protobuf/stubs/logging.h>
95 #include <google/protobuf/stubs/stringprintf.h>
96 #include <google/protobuf/stubs/strutil.h>
97 #include <google/protobuf/io/strtod.h>
98 #include <google/protobuf/io/zero_copy_stream.h>
99 #include <google/protobuf/stubs/stl_util.h>
117 #define CHARACTER_CLASS(NAME, EXPRESSION) \
120 static inline bool InClass(char c) { return EXPRESSION; } \
123 CHARACTER_CLASS(Whitespace, c ==
' ' || c ==
'\n' || c ==
'\t' || c ==
'\r' ||
124 c ==
'\v' || c ==
'\f');
126 c ==
' ' || c ==
'\t' || c ==
'\r' || c ==
'\v' || c ==
'\f');
// Character class HexDigit, declared through the CHARACTER_CLASS macro.
// The expression is evaluated against the character `c` and accepts
// '0'-'9', 'a'-'f' and 'A'-'F', i.e. hexadecimal digits in either case.
132 CHARACTER_CLASS(HexDigit, (
'0' <= c && c <=
'9') || (
'a' <= c && c <=
'f') ||
133 (
'A' <= c && c <=
'F'));
136 (
'a' <= c && c <=
'z') || (
'A' <= c && c <=
'Z') || (c ==
'_'));
139 (
'A' <= c && c <=
'Z') ||
140 (
'0' <= c && c <=
'9') || (c ==
'_'));
// Character class Escape, declared through the CHARACTER_CLASS macro.
// The expression accepts exactly these characters: a, b, f, n, r, t, v,
// backslash, question mark, single quote and double quote.
// NOTE(review): presumably these are the single-character escape codes
// allowed after a backslash in a string literal -- confirm against
// ConsumeString().
142 CHARACTER_CLASS(Escape, c ==
'a' || c ==
'b' || c ==
'f' || c ==
'n' ||
143 c ==
'r' || c ==
't' || c ==
'v' || c ==
'\\' ||
144 c ==
'?' || c ==
'\'' || c ==
'\"');
146 #undef CHARACTER_CLASS
150 inline int DigitValue(
char digit) {
151 if (
'0' <= digit && digit <=
'9')
return digit -
'0';
152 if (
'a' <= digit && digit <=
'z')
return digit -
'a' + 10;
153 if (
'A' <= digit && digit <=
'Z')
return digit -
'A' + 10;
158 inline char TranslateEscape(
char c) {
196 ErrorCollector* error_collector)
205 record_target_(NULL),
207 allow_f_after_float_(
false),
208 comment_style_(CPP_COMMENT_STYLE),
209 require_space_after_number_(
true),
210 allow_multiline_strings_(
false) {
219 Tokenizer::~Tokenizer() {
222 if (buffer_size_ > buffer_pos_) {
223 input_->BackUp(buffer_size_ - buffer_pos_);
// Accessor for the report_whitespace_ flag. Const member function: it only
// reads the member and returns its current value.
227 bool Tokenizer::report_whitespace()
const {
return report_whitespace_; }
229 void Tokenizer::set_report_whitespace(
bool report) {
230 report_whitespace_ = report;
231 report_newlines_ &= report;
// Accessor for the report_newlines_ flag. Const member function: it only
// reads the member and returns its current value.
235 bool Tokenizer::report_newlines()
const {
return report_newlines_; }
237 void Tokenizer::set_report_newlines(
bool report) {
238 report_newlines_ = report;
239 report_whitespace_ |= report;
245 void Tokenizer::NextChar() {
248 if (current_char_ ==
'\n') {
251 }
else if (current_char_ ==
'\t') {
252 column_ += kTabWidth - column_ % kTabWidth;
259 if (buffer_pos_ < buffer_size_) {
260 current_char_ =
buffer_[buffer_pos_];
266 void Tokenizer::Refresh() {
268 current_char_ =
'\0';
273 if (record_target_ != NULL && record_start_ < buffer_size_) {
274 record_target_->append(
buffer_ + record_start_,
275 buffer_size_ - record_start_);
279 const void*
data = NULL;
287 current_char_ =
'\0';
290 }
while (buffer_size_ == 0);
299 record_start_ = buffer_pos_;
302 inline void Tokenizer::StopRecording() {
307 if (buffer_pos_ != record_start_) {
308 record_target_->append(
buffer_ + record_start_,
309 buffer_pos_ - record_start_);
311 record_target_ = NULL;
315 inline void Tokenizer::StartToken() {
323 inline void Tokenizer::EndToken() {
331 template <
typename CharacterClass>
332 inline bool Tokenizer::LookingAt() {
333 return CharacterClass::InClass(current_char_);
336 template <
typename CharacterClass>
337 inline bool Tokenizer::TryConsumeOne() {
338 if (CharacterClass::InClass(current_char_)) {
346 inline bool Tokenizer::TryConsume(
char c) {
347 if (current_char_ ==
c) {
355 template <
typename CharacterClass>
356 inline void Tokenizer::ConsumeZeroOrMore() {
357 while (CharacterClass::InClass(current_char_)) {
362 template <
typename CharacterClass>
363 inline void Tokenizer::ConsumeOneOrMore(
const char*
error) {
364 if (!CharacterClass::InClass(current_char_)) {
369 }
while (CharacterClass::InClass(current_char_));
377 void Tokenizer::ConsumeString(
char delimiter) {
379 switch (current_char_) {
381 AddError(
"Unexpected end of string.");
385 if (!allow_multiline_strings_) {
386 AddError(
"String literals cannot cross line boundaries.");
396 if (TryConsumeOne<Escape>()) {
398 }
else if (TryConsumeOne<OctalDigit>()) {
402 }
else if (TryConsume(
'x')) {
403 if (!TryConsumeOne<HexDigit>()) {
404 AddError(
"Expected hex digits for escape sequence.");
407 }
else if (TryConsume(
'u')) {
408 if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
409 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
410 AddError(
"Expected four hex digits for \\u escape sequence.");
412 }
else if (TryConsume(
'U')) {
415 if (!TryConsume(
'0') || !TryConsume(
'0') ||
416 !(TryConsume(
'0') || TryConsume(
'1')) ||
417 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
418 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
419 !TryConsumeOne<HexDigit>()) {
421 "Expected eight hex digits up to 10ffff for \\U escape "
425 AddError(
"Invalid escape sequence in string literal.");
431 if (current_char_ == delimiter) {
442 Tokenizer::TokenType Tokenizer::ConsumeNumber(
bool started_with_zero,
443 bool started_with_dot) {
444 bool is_float =
false;
446 if (started_with_zero && (TryConsume(
'x') || TryConsume(
'X'))) {
448 ConsumeOneOrMore<HexDigit>(
"\"0x\" must be followed by hex digits.");
450 }
else if (started_with_zero && LookingAt<Digit>()) {
452 ConsumeZeroOrMore<OctalDigit>();
453 if (LookingAt<Digit>()) {
454 AddError(
"Numbers starting with leading zero must be in octal.");
455 ConsumeZeroOrMore<Digit>();
460 if (started_with_dot) {
462 ConsumeZeroOrMore<Digit>();
464 ConsumeZeroOrMore<Digit>();
466 if (TryConsume(
'.')) {
468 ConsumeZeroOrMore<Digit>();
472 if (TryConsume(
'e') || TryConsume(
'E')) {
474 TryConsume(
'-') || TryConsume(
'+');
475 ConsumeOneOrMore<Digit>(
"\"e\" must be followed by exponent.");
478 if (allow_f_after_float_ && (TryConsume(
'f') || TryConsume(
'F'))) {
483 if (LookingAt<Letter>() && require_space_after_number_) {
484 AddError(
"Need space between number and identifier.");
485 }
else if (current_char_ ==
'.') {
488 "Already saw decimal point or exponent; can't have another one.");
490 AddError(
"Hex and octal numbers must be integers.");
494 return is_float ? TYPE_FLOAT : TYPE_INTEGER;
500 while (current_char_ !=
'\0' && current_char_ !=
'\n') {
505 if (
content != NULL) StopRecording();
509 int start_line =
line_;
510 int start_column = column_ - 2;
515 while (current_char_ !=
'\0' && current_char_ !=
'*' &&
516 current_char_ !=
'/' && current_char_ !=
'\n') {
520 if (TryConsume(
'\n')) {
521 if (
content != NULL) StopRecording();
524 ConsumeZeroOrMore<WhitespaceNoNewline>();
525 if (TryConsume(
'*')) {
526 if (TryConsume(
'/')) {
533 }
else if (TryConsume(
'*') && TryConsume(
'/')) {
541 }
else if (TryConsume(
'/') && current_char_ ==
'*') {
545 "\"/*\" inside block comment. Block comments cannot be nested.");
546 }
else if (current_char_ ==
'\0') {
547 AddError(
"End-of-file inside block comment.");
549 " Comment started here.");
550 if (
content != NULL) StopRecording();
556 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
557 if (comment_style_ == CPP_COMMENT_STYLE && TryConsume(
'/')) {
558 if (TryConsume(
'/')) {
560 }
else if (TryConsume(
'*')) {
561 return BLOCK_COMMENT;
569 return SLASH_NOT_COMMENT;
571 }
else if (comment_style_ == SH_COMMENT_STYLE && TryConsume(
'#')) {
578 bool Tokenizer::TryConsumeWhitespace() {
579 if (report_newlines_) {
580 if (TryConsumeOne<WhitespaceNoNewline>()) {
581 ConsumeZeroOrMore<WhitespaceNoNewline>();
587 if (TryConsumeOne<Whitespace>()) {
588 ConsumeZeroOrMore<Whitespace>();
590 return report_whitespace_;
595 bool Tokenizer::TryConsumeNewline() {
596 if (!report_whitespace_ || !report_newlines_) {
599 if (TryConsume(
'\n')) {
611 while (!read_error_) {
613 bool report_token = TryConsumeWhitespace() || TryConsumeNewline();
619 switch (TryConsumeCommentStart()) {
621 ConsumeLineComment(NULL);
624 ConsumeBlockComment(NULL);
626 case SLASH_NOT_COMMENT:
633 if (read_error_)
break;
635 if (LookingAt<Unprintable>() || current_char_ ==
'\0') {
636 AddError(
"Invalid control characters encountered in text.");
643 while (TryConsumeOne<Unprintable>() ||
644 (!read_error_ && TryConsume(
'\0'))) {
652 if (TryConsumeOne<Letter>()) {
653 ConsumeZeroOrMore<Alphanumeric>();
655 }
else if (TryConsume(
'0')) {
656 current_.type = ConsumeNumber(
true,
false);
657 }
else if (TryConsume(
'.')) {
661 if (TryConsumeOne<Digit>()) {
663 if (previous_.type == TYPE_IDENTIFIER &&
665 current_.column == previous_.end_column) {
669 "Need space between identifier and decimal point.");
671 current_.type = ConsumeNumber(
false,
true);
675 }
else if (TryConsumeOne<Digit>()) {
676 current_.type = ConsumeNumber(
false,
false);
677 }
else if (TryConsume(
'\"')) {
680 }
else if (TryConsume(
'\'')) {
685 if (current_char_ & 0x80) {
689 static_cast<unsigned char>(current_char_)));
718 class CommentCollector {
734 ~CommentCollector() {
819 if (TryConsume(
static_cast<char>(0xEF))) {
820 if (!TryConsume(
static_cast<char>(0xBB)) ||
821 !TryConsume(
static_cast<char>(0xBF))) {
823 "Proto file starts with 0xEF but not UTF-8 BOM. "
824 "Only UTF-8 is accepted for proto file.");
828 collector.DetachFromPrev();
832 ConsumeZeroOrMore<WhitespaceNoNewline>();
833 switch (TryConsumeCommentStart()) {
835 ConsumeLineComment(collector.GetBufferForLineComment());
842 ConsumeBlockComment(collector.GetBufferForBlockComment());
844 ConsumeZeroOrMore<WhitespaceNoNewline>();
845 if (!TryConsume(
'\n')) {
848 collector.ClearBuffer();
856 case SLASH_NOT_COMMENT:
859 if (!TryConsume(
'\n')) {
869 ConsumeZeroOrMore<WhitespaceNoNewline>();
871 switch (TryConsumeCommentStart()) {
873 ConsumeLineComment(collector.GetBufferForLineComment());
876 ConsumeBlockComment(collector.GetBufferForBlockComment());
880 ConsumeZeroOrMore<WhitespaceNoNewline>();
883 case SLASH_NOT_COMMENT:
886 if (TryConsume(
'\n')) {
889 collector.DetachFromPrev();
919 const char*
ptr =
text.c_str();
922 if (
ptr[1] ==
'x' ||
ptr[1] ==
'X') {
933 for (; *
ptr !=
'\0';
ptr++) {
934 int digit = DigitValue(*
ptr);
935 if (digit < 0 || digit >=
base) {
940 if (
static_cast<uint64_t>(digit) > max_value ||
960 if (*
end ==
'e' || *
end ==
'E') {
967 if (*
end ==
'f' || *
end ==
'F') {
973 <<
" Tokenizer::ParseFloat() passed text that could not have been"
974 " tokenized as a float: "
984 if (code_point <= 0x7f) {
987 }
else if (code_point <= 0x07ff) {
988 tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
990 }
else if (code_point <= 0xffff) {
991 tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
992 ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
994 }
else if (code_point <= 0x10ffff) {
995 tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
996 ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
997 (code_point & 0x003f);
1014 if (
len == 0)
return false;
1016 if (*
ptr ==
'\0')
return false;
1052 if (
key ==
'u')
return 4;
1053 if (
key ==
'U')
return 8;
1062 const char*
p =
ptr;
1092 const size_t text_size =
text.size();
1093 if (text_size == 0) {
1094 GOOGLE_LOG(DFATAL) <<
" Tokenizer::ParseStringAppend() passed text that could not"
1095 " have been tokenized as a string: "
1103 const size_t new_len = text_size +
output->size();
1104 if (new_len >
output->capacity()) {
1105 output->reserve(new_len);
1112 for (
const char*
ptr =
text.c_str() + 1; *
ptr !=
'\0';
ptr++) {
1113 if (*
ptr ==
'\\' &&
ptr[1] !=
'\0') {
1117 if (OctalDigit::InClass(*
ptr)) {
1120 if (OctalDigit::InClass(
ptr[1])) {
1124 if (OctalDigit::InClass(
ptr[1])) {
1130 }
else if (*
ptr ==
'x') {
1134 if (HexDigit::InClass(
ptr[1])) {
1138 if (HexDigit::InClass(
ptr[1])) {
1144 }
else if (*
ptr ==
'u' || *
ptr ==
'U') {
1156 output->push_back(TranslateEscape(*
ptr));
1159 }
else if (*
ptr ==
text[0] &&
ptr[1] ==
'\0') {
1167 template <
typename CharacterClass>
1169 for (
const char character : s) {
1170 if (!CharacterClass::InClass(character))
return false;
1177 if (
text.size() == 0)
return false;
1178 if (!Letter::InClass(
text.at(0)))
return false;
1179 if (!AllInClass<Alphanumeric>(
text.substr(1)))
return false;