116 #define CHARACTER_CLASS(NAME, EXPRESSION) \
119 static inline bool InClass(char c) { return EXPRESSION; } \
122 CHARACTER_CLASS(Whitespace, c ==
' ' || c ==
'\n' || c ==
'\t' || c ==
'\r' ||
123 c ==
'\v' || c ==
'\f');
125 c ==
' ' || c ==
'\t' || c ==
'\r' || c ==
'\v' || c ==
'\f');
131 CHARACTER_CLASS(HexDigit, (
'0' <= c && c <=
'9') || (
'a' <= c && c <=
'f') ||
132 (
'A' <= c && c <=
'F'));
135 (
'a' <= c && c <=
'z') || (
'A' <= c && c <=
'Z') || (c ==
'_'));
138 (
'A' <= c && c <=
'Z') ||
139 (
'0' <= c && c <=
'9') || (c ==
'_'));
141 CHARACTER_CLASS(Escape, c ==
'a' || c ==
'b' || c ==
'f' || c ==
'n' ||
142 c ==
'r' || c ==
't' || c ==
'v' || c ==
'\\' ||
143 c ==
'?' || c ==
'\'' || c ==
'\"');
145 #undef CHARACTER_CLASS
// Maps an alphanumeric character to its value as a digit: '0'-'9' yield 0-9,
// and letters of either case yield 10 upward ('a'/'A' is 10, 'z'/'Z' is 35).
// NOTE(review): the result for any other character is not visible in this
// excerpt. Callers in this file test `digit < 0`, so presumably a negative
// sentinel is returned -- confirm against the full definition.
149 inline int DigitValue(
char digit) {
150 if (
'0' <= digit && digit <=
'9')
return digit -
'0';
151 if (
'a' <= digit && digit <=
'z')
return digit -
'a' + 10;
152 if (
'A' <= digit && digit <=
'Z')
return digit -
'A' + 10;
157 inline char TranslateEscape(
char c) {
204 record_target_(
NULL),
206 allow_f_after_float_(
false),
207 comment_style_(CPP_COMMENT_STYLE),
208 require_space_after_number_(
true),
209 allow_multiline_strings_(
false) {
315 template <
typename CharacterClass>
320 template <
typename CharacterClass>
339 template <
typename CharacterClass>
346 template <
typename CharacterClass>
365 AddError(
"Unexpected end of string.");
370 AddError(
"String literals cannot cross line boundaries.");
380 if (TryConsumeOne<Escape>()) {
382 }
else if (TryConsumeOne<OctalDigit>()) {
387 if (!TryConsumeOne<HexDigit>()) {
388 AddError(
"Expected hex digits for escape sequence.");
392 if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
393 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
394 AddError(
"Expected four hex digits for \\u escape sequence.");
401 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
402 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
403 !TryConsumeOne<HexDigit>()) {
405 "Expected eight hex digits up to 10ffff for \\U escape "
409 AddError(
"Invalid escape sequence in string literal.");
427 bool started_with_dot) {
432 ConsumeOneOrMore<HexDigit>(
"\"0x\" must be followed by hex digits.");
434 }
else if (started_with_zero && LookingAt<Digit>()) {
436 ConsumeZeroOrMore<OctalDigit>();
437 if (LookingAt<Digit>()) {
438 AddError(
"Numbers starting with leading zero must be in octal.");
439 ConsumeZeroOrMore<Digit>();
444 if (started_with_dot) {
446 ConsumeZeroOrMore<Digit>();
448 ConsumeZeroOrMore<Digit>();
452 ConsumeZeroOrMore<Digit>();
459 ConsumeOneOrMore<Digit>(
"\"e\" must be followed by exponent.");
468 AddError(
"Need space between number and identifier.");
472 "Already saw decimal point or exponent; can't have another one.");
474 AddError(
"Hex and octal numbers must be integers.");
493 int start_line =
line_;
494 int start_column =
column_ - 2;
508 ConsumeZeroOrMore<WhitespaceNoNewline>();
519 if (content !=
NULL) {
522 content->erase(content->size() - 2);
529 "\"/*\" inside block comment. Block comments cannot be nested.");
531 AddError(
"End-of-file inside block comment.");
533 " Comment started here.");
568 ConsumeZeroOrMore<Whitespace>();
587 AddError(
"Invalid control characters encountered in text.");
594 while (TryConsumeOne<Unprintable>() ||
603 if (TryConsumeOne<Letter>()) {
604 ConsumeZeroOrMore<Alphanumeric>();
612 if (TryConsumeOne<Digit>()) {
620 "Need space between identifier and decimal point.");
626 }
else if (TryConsumeOne<Digit>()) {
669 class CommentCollector {
685 ~CommentCollector() {
773 "Proto file starts with 0xEF but not UTF-8 BOM. "
774 "Only UTF-8 is accepted for proto file.");
778 collector.DetachFromPrev();
782 ConsumeZeroOrMore<WhitespaceNoNewline>();
794 ConsumeZeroOrMore<WhitespaceNoNewline>();
798 collector.ClearBuffer();
819 ConsumeZeroOrMore<WhitespaceNoNewline>();
830 ConsumeZeroOrMore<WhitespaceNoNewline>();
839 collector.DetachFromPrev();
841 bool result =
Next();
869 const char* ptr = text.c_str();
872 if (ptr[1] ==
'x' || ptr[1] ==
'X') {
883 for (; *ptr !=
'\0'; ptr++) {
884 int digit = DigitValue(*ptr);
885 if (digit < 0 || digit >=
base) {
890 if (digit > max_value || result > (max_value - digit) /
base) {
894 result = result *
base + digit;
902 const char*
start = text.c_str();
909 if (*
end ==
'e' || *
end ==
'E') {
916 if (*
end ==
'f' || *
end ==
'F') {
921 <<
" Tokenizer::ParseFloat() passed text that could not have been"
922 " tokenized as a float: "
932 if (code_point <= 0x7f) {
935 }
else if (code_point <= 0x07ff) {
936 tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
938 }
else if (code_point <= 0xffff) {
939 tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
940 ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
942 }
else if (code_point <= 0x1fffff) {
943 tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
944 ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
945 (code_point & 0x003f);
954 output->append(
reinterpret_cast<const char*
>(&tmp) +
sizeof(tmp) -
len,
len);
961 if (
len == 0)
return false;
962 for (
const char*
end = ptr +
len; ptr <
end; ++ptr) {
963 if (*ptr ==
'\0')
return false;
964 *result = (*result << 4) + DigitValue(*ptr);
998 if (
key ==
'u')
return 4;
999 if (
key ==
'U')
return 8;
1008 const char*
p = ptr;
1038 const size_t text_size = text.size();
1039 if (text_size == 0) {
1040 GOOGLE_LOG(DFATAL) <<
" Tokenizer::ParseStringAppend() passed text that could not"
1041 " have been tokenized as a string: "
1049 const size_t new_len = text_size +
output->size();
1050 if (new_len >
output->capacity()) {
1051 output->reserve(new_len);
1058 for (
const char* ptr = text.c_str() + 1; *ptr !=
'\0'; ptr++) {
1059 if (*ptr ==
'\\' && ptr[1] !=
'\0') {
1063 if (OctalDigit::InClass(*ptr)) {
1065 int code = DigitValue(*ptr);
1066 if (OctalDigit::InClass(ptr[1])) {
1068 code = code * 8 + DigitValue(*ptr);
1070 if (OctalDigit::InClass(ptr[1])) {
1072 code = code * 8 + DigitValue(*ptr);
1074 output->push_back(
static_cast<char>(code));
1076 }
else if (*ptr ==
'x') {
1080 if (HexDigit::InClass(ptr[1])) {
1082 code = DigitValue(*ptr);
1084 if (HexDigit::InClass(ptr[1])) {
1086 code = code * 16 + DigitValue(*ptr);
1088 output->push_back(
static_cast<char>(code));
1090 }
else if (*ptr ==
'u' || *ptr ==
'U') {
1102 output->push_back(TranslateEscape(*ptr));
1105 }
else if (*ptr == text[0] && ptr[1] ==
'\0') {
1113 template <
typename CharacterClass>
1115 for (
int i = 0;
i <
s.size(); ++
i) {
1116 if (!CharacterClass::InClass(
s[
i]))
return false;
1123 if (text.size() == 0)
return false;
1124 if (!Letter::InClass(text.at(0)))
return false;
1125 if (!AllInClass<Alphanumeric>(text.substr(1)))
return false;