tokenizer_unittest.cc
Go to the documentation of this file.
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: kenton@google.com (Kenton Varda)
32 // Based on original Protocol Buffers design by
33 // Sanjay Ghemawat, Jeff Dean, and others.
34 
35 #include <limits.h>
36 #include <math.h>
37 
38 #include <vector>
39 
42 
48 #include <gtest/gtest.h>
49 
50 namespace google {
51 namespace protobuf {
52 namespace io {
53 namespace {
54 
55 // ===================================================================
56 // Data-Driven Test Infrastructure
57 
58 // TODO(kenton): This is copied from coded_stream_unittest. This is
59 // temporary until these features are integrated into gTest itself.
60 
61 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
62 // gTest. These macros can be used to declare tests which should be
63 // run multiple times, once for each item in some input array. TEST_1D
64 // tests all cases in a single input array. TEST_2D tests all
65 // combinations of cases from two arrays. The arrays must be statically
66 // defined such that the GOOGLE_ARRAYSIZE() macro works on them. Example:
67 //
68 // int kCases[] = {1, 2, 3, 4};
69 // TEST_1D(MyFixture, MyTest, kCases) {
70 // EXPECT_GT(kCases_case, 0);
71 // }
72 //
73 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
74 // they are all greater than zero. In case of failure, the exact case
75 // which failed will be printed. The case type must be printable using
76 // ostream::operator<<.
77 
// Declares a data-driven test whose body runs once per element of CASES.
//
//   FIXTURE  gTest fixture class the generated test class derives from.
//   NAME     test name.
//   CASES    statically sized array of test cases.
//
// The block that follows the macro invocation becomes the body of
// DoSingleCase(); inside it the current element is named `<CASES>_case`.
// Every iteration runs under a SCOPED_TRACE that prints the case index and
// value, so the element type must be streamable.
#define TEST_1D(FIXTURE, NAME, CASES)                                       \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType>                                            \
    void DoSingleCase(const CaseType& CASES##_case);                        \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    /* The element count is presumably an unsigned size_t; convert it */    \
    /* once so the loop does not mix signed and unsigned operands. */       \
    const int case_count = static_cast<int>(GOOGLE_ARRAYSIZE(CASES));       \
    for (int i = 0; i < case_count; i++) {                                  \
      SCOPED_TRACE(testing::Message()                                       \
                   << #CASES " case #" << i << ": " << CASES[i]);           \
      DoSingleCase(CASES[i]);                                               \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType>                                              \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
95 
// Declares a data-driven test whose body runs once for every combination of
// an element of CASES1 with an element of CASES2.
//
//   FIXTURE  gTest fixture class the generated test class derives from.
//   NAME     test name.
//   CASES1   statically sized array iterated by the outer loop.
//   CASES2   statically sized array iterated by the inner loop.
//
// The block that follows the macro invocation becomes the body of
// DoSingleCase(); inside it the current elements are named `<CASES1>_case`
// and `<CASES2>_case`. Every combination runs under a SCOPED_TRACE that
// prints both indices and values, so both element types must be streamable.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    /* The element counts are presumably unsigned size_t values; */         \
    /* convert them once so the loops do not mix signed and unsigned. */    \
    const int outer_count = static_cast<int>(GOOGLE_ARRAYSIZE(CASES1));     \
    const int inner_count = static_cast<int>(GOOGLE_ARRAYSIZE(CASES2));     \
    for (int i = 0; i < outer_count; i++) {                                 \
      for (int j = 0; j < inner_count; j++) {                               \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)
118 
119 // -------------------------------------------------------------------
120 
121 // An input stream that is basically like an ArrayInputStream but sometimes
122 // returns empty buffers, just to throw us off.
123 class TestInputStream : public ZeroCopyInputStream {
124  public:
125  TestInputStream(const void* data, int size, int block_size)
126  : array_stream_(data, size, block_size), counter_(0) {}
127  ~TestInputStream() {}
128 
129  // implements ZeroCopyInputStream ----------------------------------
130  bool Next(const void** data, int* size) {
131  // We'll return empty buffers starting with the first buffer, and every
132  // 3 and 5 buffers after that.
133  if (counter_ % 3 == 0 || counter_ % 5 == 0) {
134  *data = NULL;
135  *size = 0;
136  ++counter_;
137  return true;
138  } else {
139  ++counter_;
140  return array_stream_.Next(data, size);
141  }
142  }
143 
144  void BackUp(int count) { return array_stream_.BackUp(count); }
145  bool Skip(int count) { return array_stream_.Skip(count); }
146  int64 ByteCount() const { return array_stream_.ByteCount(); }
147 
148  private:
149  ArrayInputStream array_stream_;
150  int counter_;
151 };
152 
153 // -------------------------------------------------------------------
154 
155 // An error collector which simply concatenates all its errors into a big
156 // block of text which can be checked.
// ErrorCollector implementation used by the tests in this file: every
// reported error is appended to the `text_` string so a test can compare the
// accumulated output against an expected block of text.
157 class TestErrorCollector : public ErrorCollector {
158  public:
159  TestErrorCollector() {}
160  ~TestErrorCollector() {}
161
// NOTE(review): the declaration of `text_` (original line 162) is missing from
// this listing. The tests read it directly (error_collector.text_), so it is
// presumably a public std::string member -- confirm against the upstream file.
163
164  // implements ErrorCollector ---------------------------------------
// Appends one record built from the format "$0:$1: $2\n", i.e. the line, the
// column and the message, to text_.
165  void AddError(int line, int column, const std::string& message) {
166  strings::SubstituteAndAppend(&text_, "$0:$1: $2\n", line, column, message);
167  }
168 };
169 
170 // -------------------------------------------------------------------
171 
172 // We test each operation over a variety of block sizes to ensure that
173 // we test cases where reads cross buffer boundaries as well as cases
174 // where they don't. This is sort of a brute-force approach to this,
175 // but it's easy to write and easy to understand.
176 const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
177 
// Fixture shared by the tokenizer tests in this file.
178 class TokenizerTest : public testing::Test {
179  protected:
180  // For easy testing.
// Returns the integer value parsed from `text`.
// NOTE(review): the statement that fills in `result` (original line 183) is
// missing from this listing; presumably it calls Tokenizer::ParseInteger()
// and checks that it succeeds -- confirm against the upstream file.
181  uint64 ParseInteger(const std::string& text) {
182  uint64 result;
184  return result;
185  }
186 };
187 
188 // ===================================================================
189 
190 // These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
191 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
192 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
193 
194 // In each test case, the entire input text should parse as a single token
195 // of the given type.
196 struct SimpleTokenCase {
// NOTE(review): the member declarations (original lines 197-198) are missing
// from this listing. Code elsewhere in this file reads `.input` (the text to
// tokenize) and `.type` (the expected token type) from this struct.
199 };
200 
201 inline std::ostream& operator<<(std::ostream& out,
202  const SimpleTokenCase& test_case) {
203  return out << CEscape(test_case.input);
204 }
205 
// Inputs that must each tokenize as exactly one token. Every entry pairs the
// input text with the token type the tokenizer is expected to report for it.
206 SimpleTokenCase kSimpleTokenCases[] = {
207  // Test identifiers.
208  {"hello", Tokenizer::TYPE_IDENTIFIER},
209
210  // Test integers.
211  {"123", Tokenizer::TYPE_INTEGER},
212  {"0xab6", Tokenizer::TYPE_INTEGER},
213  {"0XAB6", Tokenizer::TYPE_INTEGER},
214  {"0X1234567", Tokenizer::TYPE_INTEGER},
215  {"0x89abcdef", Tokenizer::TYPE_INTEGER},
216  {"0x89ABCDEF", Tokenizer::TYPE_INTEGER},
217  {"01234567", Tokenizer::TYPE_INTEGER},
218
219  // Test floats.
220  {"123.45", Tokenizer::TYPE_FLOAT},
221  {"1.", Tokenizer::TYPE_FLOAT},
222  {"1e3", Tokenizer::TYPE_FLOAT},
223  {"1E3", Tokenizer::TYPE_FLOAT},
224  {"1e-3", Tokenizer::TYPE_FLOAT},
225  {"1e+3", Tokenizer::TYPE_FLOAT},
226  {"1.e3", Tokenizer::TYPE_FLOAT},
227  {"1.2e3", Tokenizer::TYPE_FLOAT},
228  {".1", Tokenizer::TYPE_FLOAT},
229  {".1e3", Tokenizer::TYPE_FLOAT},
230  {".1e-3", Tokenizer::TYPE_FLOAT},
231  {".1e+3", Tokenizer::TYPE_FLOAT},
232
233  // Test strings.
234  {"'hello'", Tokenizer::TYPE_STRING},
235  {"\"foo\"", Tokenizer::TYPE_STRING},
236  {"'a\"b'", Tokenizer::TYPE_STRING},
237  {"\"a'b\"", Tokenizer::TYPE_STRING},
238  {"'a\\'b'", Tokenizer::TYPE_STRING},
239  {"\"a\\\"b\"", Tokenizer::TYPE_STRING},
240  {"'\\xf'", Tokenizer::TYPE_STRING},
241  {"'\\0'", Tokenizer::TYPE_STRING},
242
243  // Test symbols.
244  {"+", Tokenizer::TYPE_SYMBOL},
245  {".", Tokenizer::TYPE_SYMBOL},
246 };
247 
248 TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
249  // Set up the tokenizer.
250  TestInputStream input(kSimpleTokenCases_case.input.data(),
251  kSimpleTokenCases_case.input.size(), kBlockSizes_case);
252  TestErrorCollector error_collector;
253  Tokenizer tokenizer(&input, &error_collector);
254 
255  // Before Next() is called, the initial token should always be TYPE_START.
256  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
257  EXPECT_EQ("", tokenizer.current().text);
258  EXPECT_EQ(0, tokenizer.current().line);
259  EXPECT_EQ(0, tokenizer.current().column);
260  EXPECT_EQ(0, tokenizer.current().end_column);
261 
262  // Parse the token.
263  ASSERT_TRUE(tokenizer.Next());
264 
265  // Check that it has the right type.
266  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
267  // Check that it contains the complete input text.
268  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
269  // Check that it is located at the beginning of the input
270  EXPECT_EQ(0, tokenizer.current().line);
271  EXPECT_EQ(0, tokenizer.current().column);
272  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
273  tokenizer.current().end_column);
274 
275  // There should be no more input.
276  EXPECT_FALSE(tokenizer.Next());
277 
278  // After Next() returns false, the token should have type TYPE_END.
279  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
280  EXPECT_EQ("", tokenizer.current().text);
281  EXPECT_EQ(0, tokenizer.current().line);
282  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
283  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
284  tokenizer.current().end_column);
285 
286  // There should be no errors.
287  EXPECT_TRUE(error_collector.text_.empty());
288 }
289 
290 TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
291  // Test the "allow_f_after_float" option.
292 
293  // Set up the tokenizer.
294  const char* text = "1f 2.5f 6e3f 7F";
295  TestInputStream input(text, strlen(text), kBlockSizes_case);
296  TestErrorCollector error_collector;
297  Tokenizer tokenizer(&input, &error_collector);
298  tokenizer.set_allow_f_after_float(true);
299 
300  // Advance through tokens and check that they are parsed as expected.
301  ASSERT_TRUE(tokenizer.Next());
302  EXPECT_EQ(tokenizer.current().text, "1f");
303  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
304  ASSERT_TRUE(tokenizer.Next());
305  EXPECT_EQ(tokenizer.current().text, "2.5f");
306  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
307  ASSERT_TRUE(tokenizer.Next());
308  EXPECT_EQ(tokenizer.current().text, "6e3f");
309  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
310  ASSERT_TRUE(tokenizer.Next());
311  EXPECT_EQ(tokenizer.current().text, "7F");
312  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
313 
314  // There should be no more input.
315  EXPECT_FALSE(tokenizer.Next());
316  // There should be no errors.
317  EXPECT_TRUE(error_collector.text_.empty());
318 }
319 
320 #endif
321 
322 // -------------------------------------------------------------------
323 
324 // In each case, the input is parsed to produce a list of tokens. The
325 // last token in "output" must have type TYPE_END.
326 struct MultiTokenCase {
// NOTE(review): the declaration of the input text member (original line 327)
// is missing from this listing; code elsewhere in this file reads `.input`.
328  Tokenizer::Token output[10]; // The compiler wants a constant array
329  // size for initialization to work. There
330  // is no reason this can't be increased if
331  // needed.
332 };
333 
334 inline std::ostream& operator<<(std::ostream& out,
335  const MultiTokenCase& test_case) {
336  return out << CEscape(test_case.input);
337 }
338 
// Inputs paired with the full token sequence they must produce. Each expected
// token is listed as {type, text, line, column, end_column}, and every
// sequence finishes with a TYPE_END token.
339 MultiTokenCase kMultiTokenCases[] = {
340  // Test empty input.
341  {"",
342  {
343  {Tokenizer::TYPE_END, "", 0, 0, 0},
344  }},
345
346  // Test all token types at the same time.
347  {"foo 1 1.2 + 'bar'",
348  {
349  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
350  {Tokenizer::TYPE_INTEGER, "1", 0, 4, 5},
351  {Tokenizer::TYPE_FLOAT, "1.2", 0, 6, 9},
352  {Tokenizer::TYPE_SYMBOL, "+", 0, 10, 11},
353  {Tokenizer::TYPE_STRING, "'bar'", 0, 12, 17},
354  {Tokenizer::TYPE_END, "", 0, 17, 17},
355  }},
356
357  // Test that consecutive symbols are parsed as separate tokens.
358  {"!@+%",
359  {
360  {Tokenizer::TYPE_SYMBOL, "!", 0, 0, 1},
361  {Tokenizer::TYPE_SYMBOL, "@", 0, 1, 2},
362  {Tokenizer::TYPE_SYMBOL, "+", 0, 2, 3},
363  {Tokenizer::TYPE_SYMBOL, "%", 0, 3, 4},
364  {Tokenizer::TYPE_END, "", 0, 4, 4},
365  }},
366
367  // Test that newlines affect line numbers correctly.
368  {"foo bar\nrab oof",
369  {
370  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
371  {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7},
372  {Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3},
373  {Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7},
374  {Tokenizer::TYPE_END, "", 1, 7, 7},
375  }},
376
377  // Test that tabs affect column numbers correctly.
378  {"foo\tbar \tbaz",
379  {
380  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
381  {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 8, 11},
382  {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19},
383  {Tokenizer::TYPE_END, "", 0, 19, 19},
384  }},
385
386  // Test that tabs in string literals affect column numbers correctly.
387  {"\"foo\tbar\" baz",
388  {
389  {Tokenizer::TYPE_STRING, "\"foo\tbar\"", 0, 0, 12},
390  {Tokenizer::TYPE_IDENTIFIER, "baz", 0, 13, 16},
391  {Tokenizer::TYPE_END, "", 0, 16, 16},
392  }},
393
394  // Test that line comments are ignored.
395  {"foo // This is a comment\n"
396  "bar // This is another comment",
397  {
398  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
399  {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 0, 3},
400  {Tokenizer::TYPE_END, "", 1, 30, 30},
401  }},
402
403  // Test that block comments are ignored.
404  {"foo /* This is a block comment */ bar",
405  {
406  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
407  {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37},
408  {Tokenizer::TYPE_END, "", 0, 37, 37},
409  }},
410
411  // Test that sh-style comments are not ignored by default.
412  {"foo # bar\n"
413  "baz",
414  {
415  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
416  {Tokenizer::TYPE_SYMBOL, "#", 0, 4, 5},
417  {Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9},
418  {Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3},
419  {Tokenizer::TYPE_END, "", 1, 3, 3},
420  }},
421
422  // Test all whitespace chars
423  {"foo\n\t\r\v\fbar",
424  {
425  {Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3},
426  {Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14},
427  {Tokenizer::TYPE_END, "", 1, 14, 14},
428  }},
429 };
430 
431 TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
432  // Set up the tokenizer.
433  TestInputStream input(kMultiTokenCases_case.input.data(),
434  kMultiTokenCases_case.input.size(), kBlockSizes_case);
435  TestErrorCollector error_collector;
436  Tokenizer tokenizer(&input, &error_collector);
437 
438  // Before Next() is called, the initial token should always be TYPE_START.
439  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
440  EXPECT_EQ("", tokenizer.current().text);
441  EXPECT_EQ(0, tokenizer.current().line);
442  EXPECT_EQ(0, tokenizer.current().column);
443  EXPECT_EQ(0, tokenizer.current().end_column);
444 
445  // Loop through all expected tokens.
446  int i = 0;
447  Tokenizer::Token token;
448  do {
449  token = kMultiTokenCases_case.output[i++];
450 
451  SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
452 
453  Tokenizer::Token previous = tokenizer.current();
454 
455  // Next() should only return false when it hits the end token.
456  if (token.type != Tokenizer::TYPE_END) {
457  ASSERT_TRUE(tokenizer.Next());
458  } else {
459  ASSERT_FALSE(tokenizer.Next());
460  }
461 
462  // Check that the previous token is set correctly.
463  EXPECT_EQ(previous.type, tokenizer.previous().type);
464  EXPECT_EQ(previous.text, tokenizer.previous().text);
465  EXPECT_EQ(previous.line, tokenizer.previous().line);
466  EXPECT_EQ(previous.column, tokenizer.previous().column);
467  EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
468 
469  // Check that the token matches the expected one.
470  EXPECT_EQ(token.type, tokenizer.current().type);
471  EXPECT_EQ(token.text, tokenizer.current().text);
472  EXPECT_EQ(token.line, tokenizer.current().line);
473  EXPECT_EQ(token.column, tokenizer.current().column);
474  EXPECT_EQ(token.end_column, tokenizer.current().end_column);
475 
476  } while (token.type != Tokenizer::TYPE_END);
477 
478  // There should be no errors.
479  EXPECT_TRUE(error_collector.text_.empty());
480 }
481 
482 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
483 // "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
484 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
485 
486 TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
487  // Test the "comment_style" option.
488 
489  const char* text =
490  "foo # bar\n"
491  "baz // qux\n"
492  "corge /* grault */\n"
493  "garply";
494  const char* const kTokens[] = {"foo", // "# bar" is ignored
495  "baz", "/", "/", "qux", "corge", "/",
496  "*", "grault", "*", "/", "garply"};
497 
498  // Set up the tokenizer.
499  TestInputStream input(text, strlen(text), kBlockSizes_case);
500  TestErrorCollector error_collector;
501  Tokenizer tokenizer(&input, &error_collector);
502  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
503 
504  // Advance through tokens and check that they are parsed as expected.
505  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
506  EXPECT_TRUE(tokenizer.Next());
507  EXPECT_EQ(tokenizer.current().text, kTokens[i]);
508  }
509 
510  // There should be no more input.
511  EXPECT_FALSE(tokenizer.Next());
512  // There should be no errors.
513  EXPECT_TRUE(error_collector.text_.empty());
514 }
515 
516 #endif
517 
518 // -------------------------------------------------------------------
519 
520 // In each case, the input is expected to have two tokens named "prev" and
521 // "next" with comments in between.
522 struct DocCommentCase {
// NOTE(review): several member declarations (original lines 523, 525 and 527)
// are missing from this listing. Code elsewhere in this file reads `.input`,
// `.prev_trailing_comments` and `.next_leading_comments` from this struct.
524
// Expected detached comments. The DocComments test expects the entry after
// the last comment to be NULL, so at most 9 of the 10 slots can hold comments.
526  const char* detached_comments[10];
528 };
529 
530 inline std::ostream& operator<<(std::ostream& out,
531  const DocCommentCase& test_case) {
532  return out << CEscape(test_case.input);
533 }
534 
// Comment-attribution cases. Each entry lists, in order: the input text, the
// expected comments trailing the "prev" token, the expected detached comments,
// and the expected comments leading the "next" token.
535 DocCommentCase kDocCommentCases[] = {
536  {"prev next",
537
538  "",
539  {},
540  ""},
541
542  {"prev /* ignored */ next",
543
544  "",
545  {},
546  ""},
547
548  {"prev // trailing comment\n"
549  "next",
550
551  " trailing comment\n",
552  {},
553  ""},
554
555  {"prev\n"
556  "// leading comment\n"
557  "// line 2\n"
558  "next",
559
560  "",
561  {},
562  " leading comment\n"
563  " line 2\n"},
564
565  {"prev\n"
566  "// trailing comment\n"
567  "// line 2\n"
568  "\n"
569  "next",
570
571  " trailing comment\n"
572  " line 2\n",
573  {},
574  ""},
575
576  {"prev // trailing comment\n"
577  "// leading comment\n"
578  "// line 2\n"
579  "next",
580
581  " trailing comment\n",
582  {},
583  " leading comment\n"
584  " line 2\n"},
585
586  {"prev /* trailing block comment */\n"
587  "/* leading block comment\n"
588  " * line 2\n"
589  " * line 3 */"
590  "next",
591
592  " trailing block comment ",
593  {},
594  " leading block comment\n"
595  " line 2\n"
596  " line 3 "},
597
598  {"prev\n"
599  "/* trailing block comment\n"
600  " * line 2\n"
601  " * line 3\n"
602  " */\n"
603  "/* leading block comment\n"
604  " * line 2\n"
605  " * line 3 */"
606  "next",
607
608  " trailing block comment\n"
609  " line 2\n"
610  " line 3\n",
611  {},
612  " leading block comment\n"
613  " line 2\n"
614  " line 3 "},
615
616  {"prev\n"
617  "// trailing comment\n"
618  "\n"
619  "// detached comment\n"
620  "// line 2\n"
621  "\n"
622  "// second detached comment\n"
623  "/* third detached comment\n"
624  " * line 2 */\n"
625  "// leading comment\n"
626  "next",
627
628  " trailing comment\n",
629  {" detached comment\n"
630  " line 2\n",
631  " second detached comment\n",
632  " third detached comment\n"
633  " line 2 "},
634  " leading comment\n"},
635
636  {"prev /**/\n"
637  "\n"
638  "// detached comment\n"
639  "\n"
640  "// leading comment\n"
641  "next",
642
643  "",
644  {" detached comment\n"},
645  " leading comment\n"},
646
647  {"prev /**/\n"
648  "// leading comment\n"
649  "next",
650
651  "",
652  {},
653  " leading comment\n"},
654 };
655 
// For every (doc comment case, block size) pair: reads the "prev" token, then
// calls NextWithComments() and checks the comments it attributes to "prev",
// to "next", and to neither (detached). A second tokenizer runs over the same
// input with NULL output arguments and must reach the same tokens.
//
// NOTE(review): several statements of this test (original lines 674, 676, 678
// and 684) are missing from this listing, including the declarations of
// `prev_trailing_comments` and `next_leading_comments`.
656 TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
657  // Set up the tokenizer.
658  TestInputStream input(kDocCommentCases_case.input.data(),
659  kDocCommentCases_case.input.size(), kBlockSizes_case);
660  TestErrorCollector error_collector;
661  Tokenizer tokenizer(&input, &error_collector);
662
663  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
664  TestInputStream input2(kDocCommentCases_case.input.data(),
665  kDocCommentCases_case.input.size(), kBlockSizes_case);
666  Tokenizer tokenizer2(&input2, &error_collector);
667
668  tokenizer.Next();
669  tokenizer2.Next();
670
// Both tokenizers must now be positioned on the "prev" token.
671  EXPECT_EQ("prev", tokenizer.current().text);
672  EXPECT_EQ("prev", tokenizer2.current().text);
673
675  std::vector<std::string> detached_comments;
677  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
679  tokenizer2.NextWithComments(NULL, NULL, NULL);
680  EXPECT_EQ("next", tokenizer.current().text);
681  EXPECT_EQ("next", tokenizer2.current().text);
682
683  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
685
// NOTE(review): the ASSERT_LT bound below is the number of test cases in
// kDocCommentCases, but `i` indexes the detached_comments array of a single
// case. The intended bound is presumably
// GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments) -- confirm and fix.
686  for (int i = 0; i < detached_comments.size(); i++) {
687  ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
688  ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
689  EXPECT_EQ(kDocCommentCases_case.detached_comments[i], detached_comments[i]);
690  }
691
692  // Verify that we matched all the detached comments.
// NOTE(review): this reads the expected-array entry one past the last matched
// comment; if the tokenizer ever returned as many comments as the array has
// slots, the index would be out of range.
693  EXPECT_EQ(NULL,
694  kDocCommentCases_case.detached_comments[detached_comments.size()]);
695
696  EXPECT_EQ(kDocCommentCases_case.next_leading_comments, next_leading_comments);
697 }
698 
699 // -------------------------------------------------------------------
700 
701 // Test parse helpers. It's not really worth setting up a full data-driven
702 // test here.
// Checks Tokenizer::ParseInteger() on decimal, hex and octal input, first via
// the fixture's ParseInteger() helper and then directly with explicit maximum
// values.
//
// NOTE(review): some statements of this test (original lines 718-722 and
// 725-727) are missing from this listing.
703 TEST_F(TokenizerTest, ParseInteger) {
704  EXPECT_EQ(0, ParseInteger("0"));
705  EXPECT_EQ(123, ParseInteger("123"));
706  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
707  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
708  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
709  EXPECT_EQ(01234567, ParseInteger("01234567"));
710  EXPECT_EQ(0X123, ParseInteger("0X123"));
711
712  // Test invalid integers that may still be tokenized as integers.
713  EXPECT_EQ(0, ParseInteger("0x"));
714
// Output slot for the direct Tokenizer::ParseInteger() calls below.
715  uint64 i;
716
717  // Test invalid integers that will never be tokenized as integers.
723
724  // Test overflows.
// The second argument is the largest value accepted: parsing succeeds when the
// text equals that maximum and fails when it exceeds it by one.
728  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &i));
729  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
730  EXPECT_TRUE(Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &i));
731  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
732 }
733 
// Checks Tokenizer::ParseFloat() on well-formed floats, on out-of-range
// values, and (in debug builds with death-test support) on text that could
// never have been tokenized as a float.
//
// NOTE(review): many statements of this test (original lines 735-737,
// 739-741, 745, 747-748, 751-753 and 756-758) are missing from this listing.
734 TEST_F(TokenizerTest, ParseFloat) {
738  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
742  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
743  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
744  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
746  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
749
750  // Test invalid floats that may still be tokenized as floats.
754
755  // Test 'f' suffix.
759
760  // These should parse successfully even though they are out of range.
761  // Overflows become infinity and underflows become zero.
762  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
763  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
764
765 #ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet
766  // Test invalid floats that will never be tokenized as floats.
767  EXPECT_DEBUG_DEATH(
768  Tokenizer::ParseFloat("zxy"),
769  "passed text that could not have been tokenized as a float");
770  EXPECT_DEBUG_DEATH(
771  Tokenizer::ParseFloat("1-e0"),
772  "passed text that could not have been tokenized as a float");
773  EXPECT_DEBUG_DEATH(
774  Tokenizer::ParseFloat("-1.0"),
775  "passed text that could not have been tokenized as a float");
776 #endif // PROTOBUF_HAS_DEATH_TEST
777 }
778 
// Checks Tokenizer::ParseString() on quoted literals: plain text, octal, hex
// and Unicode escapes, and malformed input that may still reach the parser.
//
// NOTE(review): some statements of this test (original lines 780, 793, 795
// and 817) are missing from this listing, including the declaration of
// `output`.
779 TEST_F(TokenizerTest, ParseString) {
781  Tokenizer::ParseString("'hello'", &output);
782  EXPECT_EQ("hello", output);
783  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
784  EXPECT_EQ("blah\nblah2", output);
785  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
786  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
787  Tokenizer::ParseString("'\\x20\\x4'", &output);
788  EXPECT_EQ("\x20\x4", output);
789
790  // Test invalid strings that may still be tokenized as strings.
791  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output); // \l is invalid
792  EXPECT_EQ("\a?\v\t", output);
794  EXPECT_EQ("", output);
796  EXPECT_EQ("\\", output);
797
798  // Experiment with Unicode escapes. Here are one-, two-, three- and
799  // four-byte (in UTF-8) Unicode characters.
800  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
801  EXPECT_EQ("$¢€𤭢XX", output);
802  // Same thing encoded using UTF16.
803  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
804  EXPECT_EQ("$¢€𤭢XX", output);
805  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
806  // We just output this as if it were UTF8; it's not a defined code point, but
807  // it has a defined encoding.
808  Tokenizer::ParseString("'\\ud852XX'", &output);
809  EXPECT_EQ("\xed\xa1\x92XX", output);
810  // Malformed escape: Demons may fly out of the nose.
811  Tokenizer::ParseString("\\u0", &output);
812  EXPECT_EQ("u0", output);
813
814  // Test invalid strings that will never be tokenized as strings.
815 #ifdef PROTOBUF_HAS_DEATH_TEST // death tests do not work on Windows yet
816  EXPECT_DEBUG_DEATH(
818  "passed text that could not have been tokenized as a string");
819 #endif // PROTOBUF_HAS_DEATH_TEST
820 }
821 
// Checks that the appending variant keeps the existing contents of `output`,
// while ParseString() replaces them.
822 TEST_F(TokenizerTest, ParseStringAppend) {
823  // Check that ParseString and ParseStringAppend differ.
824  std::string output("stuff+");
// NOTE(review): the call under test (original line 825) is missing from this
// listing; presumably it is Tokenizer::ParseStringAppend("'hello'", &output).
826  EXPECT_EQ("stuff+hello", output);
// ParseString() overwrites what was accumulated above.
827  Tokenizer::ParseString("'hello'", &output);
828  EXPECT_EQ("hello", output);
829 }
830 
831 // -------------------------------------------------------------------
832 
833 // Each case parses some input text, ignoring the tokens produced, and
834 // checks that the error output matches what is expected.
835 struct ErrorCase {
// NOTE(review): the declaration of the input text member (original line 836)
// is missing from this listing; code elsewhere in this file reads `.input`.
837  bool recoverable; // True if the tokenizer should be able to recover and
838  // parse more tokens after seeing this error. Cases
839  // for which this is true must end with "foo" as
840  // the last token, which the test will check for.
// Exact text the error collector is expected to have accumulated for this
// input.
841  const char* errors;
842 };
843 
844 inline std::ostream& operator<<(std::ostream& out, const ErrorCase& test_case) {
845  return out << CEscape(test_case.input);
846 }
847 
// Malformed inputs paired with the errors they must produce. Each entry lists
// the input text, whether the tokenizer is expected to recover (and end on a
// "foo" token), and the exact error text, one "line:column: message" per line.
848 ErrorCase kErrorCases[] = {
849  // String errors.
850  {"'\\l' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
851  {"'\\X' foo", true, "0:2: Invalid escape sequence in string literal.\n"},
852  {"'\\x' foo", true, "0:3: Expected hex digits for escape sequence.\n"},
853  {"'foo", false, "0:4: Unexpected end of string.\n"},
854  {"'bar\nfoo", true, "0:4: String literals cannot cross line boundaries.\n"},
855  {"'\\u01' foo", true,
856  "0:5: Expected four hex digits for \\u escape sequence.\n"},
// NOTE(review): the next entry is an exact duplicate of the previous one. It
// is harmless, but it was possibly meant to cover a different malformed
// escape -- confirm and replace or drop it.
857  {"'\\u01' foo", true,
858  "0:5: Expected four hex digits for \\u escape sequence.\n"},
859  {"'\\uXYZ' foo", true,
860  "0:3: Expected four hex digits for \\u escape sequence.\n"},
861
862  // Integer errors.
863  {"123foo", true, "0:3: Need space between number and identifier.\n"},
864
865  // Hex/octal errors.
866  {"0x foo", true, "0:2: \"0x\" must be followed by hex digits.\n"},
867  {"0541823 foo", true,
868  "0:4: Numbers starting with leading zero must be in octal.\n"},
869  {"0x123z foo", true, "0:5: Need space between number and identifier.\n"},
870  {"0x123.4 foo", true, "0:5: Hex and octal numbers must be integers.\n"},
871  {"0123.4 foo", true, "0:4: Hex and octal numbers must be integers.\n"},
872
873  // Float errors.
874  {"1e foo", true, "0:2: \"e\" must be followed by exponent.\n"},
875  {"1e- foo", true, "0:3: \"e\" must be followed by exponent.\n"},
876  {"1.2.3 foo", true,
877  "0:3: Already saw decimal point or exponent; can't have another one.\n"},
878  {"1e2.3 foo", true,
879  "0:3: Already saw decimal point or exponent; can't have another one.\n"},
880  {"a.1 foo", true,
881  "0:1: Need space between identifier and decimal point.\n"},
882  // allow_f_after_float not enabled, so this should be an error.
883  {"1.0f foo", true, "0:3: Need space between number and identifier.\n"},
884
885  // Block comment errors.
886  {"/*", false,
887  "0:2: End-of-file inside block comment.\n"
888  "0:0: Comment started here.\n"},
889  {"/*/*/ foo", true,
890  "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},
891
892  // Control characters. Multiple consecutive control characters should only
893  // produce one error.
894  {"\b foo", true, "0:0: Invalid control characters encountered in text.\n"},
895  {"\b\b foo", true,
896  "0:0: Invalid control characters encountered in text.\n"},
897
898  // Check that control characters at end of input don't result in an
899  // infinite loop.
900  {"\b", false, "0:0: Invalid control characters encountered in text.\n"},
901
902  // Check recovery from '\0'. We have to explicitly specify the length of
903  // these strings because otherwise the string constructor will just call
904  // strlen() which will see the first '\0' and think that is the end of the
905  // string.
906  {std::string("\0foo", 4), true,
907  "0:0: Invalid control characters encountered in text.\n"},
908  {std::string("\0\0foo", 5), true,
909  "0:0: Invalid control characters encountered in text.\n"},
910
911  // Check error from high order bits set
912  {"\300foo", true, "0:0: Interpreting non ascii codepoint 192.\n"},
913 };
914 
915 TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
916  // Set up the tokenizer.
917  TestInputStream input(kErrorCases_case.input.data(),
918  kErrorCases_case.input.size(), kBlockSizes_case);
919  TestErrorCollector error_collector;
920  Tokenizer tokenizer(&input, &error_collector);
921 
922  // Ignore all input, except remember if the last token was "foo".
923  bool last_was_foo = false;
924  while (tokenizer.Next()) {
925  last_was_foo = tokenizer.current().text == "foo";
926  }
927 
928  // Check that the errors match what was expected.
929  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
930 
931  // If the error was recoverable, make sure we saw "foo" after it.
932  if (kErrorCases_case.recoverable) {
933  EXPECT_TRUE(last_was_foo);
934  }
935 }
936 
937 // -------------------------------------------------------------------
938 
939 TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
940  std::string text = "foo bar";
941  TestInputStream input(text.data(), text.size(), kBlockSizes_case);
942 
943  // Create a tokenizer, read one token, then destroy it.
944  {
945  TestErrorCollector error_collector;
946  Tokenizer tokenizer(&input, &error_collector);
947 
948  tokenizer.Next();
949  }
950 
951  // Only "foo" should have been read.
952  EXPECT_EQ(strlen("foo"), input.ByteCount());
953 }
954 
955 
956 } // namespace
957 } // namespace io
958 } // namespace protobuf
959 } // namespace google
ASSERT_FALSE
#define ASSERT_FALSE(condition)
Definition: gtest.h:1998
type
Tokenizer::TokenType type
Definition: tokenizer_unittest.cc:198
google::protobuf::io::Tokenizer::TYPE_FLOAT
@ TYPE_FLOAT
Definition: tokenizer.h:115
google::protobuf::io::Tokenizer::TYPE_IDENTIFIER
@ TYPE_IDENTIFIER
Definition: tokenizer.h:105
zero_copy_stream_impl.h
google::protobuf::io::Tokenizer::TokenType
TokenType
Definition: tokenizer.h:101
NULL
NULL
Definition: test_security_zap.cpp:405
google::protobuf::int64
int64_t int64
Definition: protobuf/src/google/protobuf/stubs/port.h:151
google::protobuf::io::Tokenizer::ParseInteger
static bool ParseInteger(const std::string &text, uint64 max_value, uint64 *output)
Definition: tokenizer.cc:862
input
std::string input
Definition: tokenizer_unittest.cc:197
generate_changelog.previous
previous
Definition: generate_changelog.py:55
SCOPED_TRACE
#define SCOPED_TRACE(message)
Definition: gtest.h:2296
gtest.h
tokenizer.h
EXPECT_EQ
#define EXPECT_EQ(val1, val2)
Definition: glog/src/googletest.h:155
text_
std::string text_
Definition: tokenizer_unittest.cc:162
google::protobuf::CEscape
string CEscape(const string &src)
Definition: strutil.cc:615
errors
const char * errors
Definition: tokenizer_unittest.cc:841
string
GLsizei const GLchar *const * string
Definition: glcorearb.h:3083
TEST_1D
#define TEST_1D(FIXTURE, NAME, CASES)
Definition: tokenizer_unittest.cc:78
testing::Message
Definition: gtest-message.h:90
testing::Test
Definition: gtest.h:415
kBlockSizes
static const int kBlockSizes[]
Definition: zero_copy_stream_unittest.cc:133
GOOGLE_ARRAYSIZE
#define GOOGLE_ARRAYSIZE(a)
Definition: macros.h:88
strutil.h
google::protobuf::io::Tokenizer::SH_COMMENT_STYLE
@ SH_COMMENT_STYLE
Definition: tokenizer.h:236
google::protobuf::io::Tokenizer::ParseFloat
static double ParseFloat(const std::string &text)
Definition: tokenizer.cc:901
google::protobuf.text_format.ParseFloat
def ParseFloat(text)
Definition: text_format.py:1686
google::protobuf::uint64
uint64_t uint64
Definition: protobuf/src/google/protobuf/stubs/port.h:156
prev_trailing_comments
const char * prev_trailing_comments
Definition: tokenizer_unittest.cc:525
detached_comments
const char * detached_comments[10]
Definition: tokenizer_unittest.cc:526
google::protobuf::io::Tokenizer::TYPE_STRING
@ TYPE_STRING
Definition: tokenizer.h:118
EXPECT_TRUE
#define EXPECT_TRUE(cond)
Definition: glog/src/googletest.h:137
ASSERT_LT
#define ASSERT_LT(val1, val2)
Definition: gtest.h:2094
google::protobuf::io::Tokenizer::ParseStringAppend
static void ParseStringAppend(const std::string &text, std::string *output)
Definition: tokenizer.cc:1034
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: gtest.h:1995
google::protobuf::operator<<
std::ostream & operator<<(std::ostream &o, const uint128 &b)
Definition: int128.cc:128
i
int i
Definition: gmock-matchers_test.cc:764
google::protobuf::io::Tokenizer::TYPE_END
@ TYPE_END
Definition: tokenizer.h:103
array_stream_
ArrayInputStream array_stream_
Definition: tokenizer_unittest.cc:149
common.h
google::protobuf::io::Tokenizer::ParseString
static void ParseString(const std::string &text, std::string *output)
Definition: tokenizer.h:400
pump.Skip
def Skip(lines, pos, regex)
Definition: pump.py:261
output
Tokenizer::Token output[10]
Definition: tokenizer_unittest.cc:328
size
GLsizeiptr size
Definition: glcorearb.h:2943
google::protobuf::TEST_F
TEST_F(DynamicMessageTest, Descriptor)
Definition: dynamic_message_unittest.cc:126
googletest.h
EXPECT_FALSE
#define EXPECT_FALSE(cond)
Definition: glog/src/googletest.h:145
google::protobuf::io::Tokenizer::TYPE_START
@ TYPE_START
Definition: tokenizer.h:102
logging.h
recoverable
bool recoverable
Definition: tokenizer_unittest.cc:837
google::protobuf::io::Tokenizer::TYPE_INTEGER
@ TYPE_INTEGER
Definition: tokenizer.h:109
google::protobuf.text_format.ParseInteger
def ParseInteger(text, is_signed=False, is_long=False)
Definition: text_format.py:1631
TEST_2D
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)
Definition: tokenizer_unittest.cc:96
substitute.h
data
GLint GLenum GLsizei GLsizei GLsizei GLint GLsizei const GLvoid * data
Definition: glcorearb.h:2879
google::protobuf::kuint64max
static const uint64 kuint64max
Definition: protobuf/src/google/protobuf/stubs/port.h:164
count
GLint GLsizei count
Definition: glcorearb.h:2830
next_leading_comments
const char * next_leading_comments
Definition: tokenizer_unittest.cc:527
google
Definition: data_proto2_to_proto3_util.h:11
message
GLenum GLuint GLenum GLsizei const GLchar * message
Definition: glcorearb.h:2695
google::protobuf::io::Tokenizer::TYPE_SYMBOL
@ TYPE_SYMBOL
Definition: tokenizer.h:121
google::protobuf::strings::SubstituteAndAppend
void SubstituteAndAppend(string *output, const char *format, const SubstituteArg &arg0, const SubstituteArg &arg1, const SubstituteArg &arg2, const SubstituteArg &arg3, const SubstituteArg &arg4, const SubstituteArg &arg5, const SubstituteArg &arg6, const SubstituteArg &arg7, const SubstituteArg &arg8, const SubstituteArg &arg9)
Definition: substitute.cc:68
EXPECT_DOUBLE_EQ
#define EXPECT_DOUBLE_EQ(val1, val2)
Definition: glog/src/googletest.h:176
counter_
int counter_
Definition: tokenizer_unittest.cc:150


libaditof
Author(s):
autogenerated on Wed May 21 2025 02:07:00