re2/re2/testing/compile_test.cc
Go to the documentation of this file.
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13 
14 namespace re2 {
15 
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21 
22 struct Test {
23  const char* regexp;
24  const char* code;
25 };
26 
27 static Test tests[] = {
28  { "a",
29  "3. byte [61-61] 0 -> 4\n"
30  "4. match! 0\n" },
31  { "ab",
32  "3. byte [61-61] 0 -> 4\n"
33  "4. byte [62-62] 0 -> 5\n"
34  "5. match! 0\n" },
35  { "a|c",
36  "3+ byte [61-61] 0 -> 5\n"
37  "4. byte [63-63] 0 -> 5\n"
38  "5. match! 0\n" },
39  { "a|b",
40  "3. byte [61-62] 0 -> 4\n"
41  "4. match! 0\n" },
42  { "[ab]",
43  "3. byte [61-62] 0 -> 4\n"
44  "4. match! 0\n" },
45  { "a+",
46  "3. byte [61-61] 0 -> 4\n"
47  "4+ nop -> 3\n"
48  "5. match! 0\n" },
49  { "a+?",
50  "3. byte [61-61] 0 -> 4\n"
51  "4+ match! 0\n"
52  "5. nop -> 3\n" },
53  { "a*",
54  "3+ byte [61-61] 1 -> 3\n"
55  "4. match! 0\n" },
56  { "a*?",
57  "3+ match! 0\n"
58  "4. byte [61-61] 0 -> 3\n" },
59  { "a?",
60  "3+ byte [61-61] 1 -> 5\n"
61  "4. nop -> 5\n"
62  "5. match! 0\n" },
63  { "a??",
64  "3+ nop -> 5\n"
65  "4. byte [61-61] 0 -> 5\n"
66  "5. match! 0\n" },
67  { "a{4}",
68  "3. byte [61-61] 0 -> 4\n"
69  "4. byte [61-61] 0 -> 5\n"
70  "5. byte [61-61] 0 -> 6\n"
71  "6. byte [61-61] 0 -> 7\n"
72  "7. match! 0\n" },
73  { "(a)",
74  "3. capture 2 -> 4\n"
75  "4. byte [61-61] 0 -> 5\n"
76  "5. capture 3 -> 6\n"
77  "6. match! 0\n" },
78  { "(?:a)",
79  "3. byte [61-61] 0 -> 4\n"
80  "4. match! 0\n" },
81  { "",
82  "3. match! 0\n" },
83  { ".",
84  "3+ byte [00-09] 0 -> 5\n"
85  "4. byte [0b-ff] 0 -> 5\n"
86  "5. match! 0\n" },
87  { "[^ab]",
88  "3+ byte [00-09] 0 -> 6\n"
89  "4+ byte [0b-60] 0 -> 6\n"
90  "5. byte [63-ff] 0 -> 6\n"
91  "6. match! 0\n" },
92  { "[Aa]",
93  "3. byte/i [61-61] 0 -> 4\n"
94  "4. match! 0\n" },
95  { "\\C+",
96  "3. byte [00-ff] 0 -> 4\n"
97  "4+ altmatch -> 5 | 6\n"
98  "5+ nop -> 3\n"
99  "6. match! 0\n" },
100  { "\\C*",
101  "3+ altmatch -> 4 | 5\n"
102  "4+ byte [00-ff] 1 -> 3\n"
103  "5. match! 0\n" },
104  { "\\C?",
105  "3+ byte [00-ff] 1 -> 5\n"
106  "4. nop -> 5\n"
107  "5. match! 0\n" },
108  // Issue 20992936
109  { "[[-`]",
110  "3. byte [5b-60] 0 -> 4\n"
111  "4. match! 0\n" },
112  // Issue 310
113  { "(?:|a)*",
114  "3+ nop -> 7\n"
115  "4. nop -> 9\n"
116  "5+ nop -> 7\n"
117  "6. nop -> 9\n"
118  "7+ nop -> 5\n"
119  "8. byte [61-61] 0 -> 5\n"
120  "9. match! 0\n" },
121  { "(?:|a)+",
122  "3+ nop -> 5\n"
123  "4. byte [61-61] 0 -> 5\n"
124  "5+ nop -> 3\n"
125  "6. match! 0\n" },
126 };
127 
128 TEST(TestRegexpCompileToProg, Simple) {
129  int failed = 0;
130  for (size_t i = 0; i < arraysize(tests); i++) {
131  const re2::Test& t = tests[i];
132  Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
133  if (re == NULL) {
134  LOG(ERROR) << "Cannot parse: " << t.regexp;
135  failed++;
136  continue;
137  }
138  Prog* prog = re->CompileToProg(0);
139  if (prog == NULL) {
140  LOG(ERROR) << "Cannot compile: " << t.regexp;
141  re->Decref();
142  failed++;
143  continue;
144  }
145  ASSERT_TRUE(re->CompileToProg(1) == NULL);
146  std::string s = prog->Dump();
147  if (s != t.code) {
148  LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
149  LOG(ERROR) << "Want:\n" << t.code;
150  LOG(ERROR) << "Got:\n" << s;
151  failed++;
152  }
153  delete prog;
154  re->Decref();
155  }
156  EXPECT_EQ(failed, 0);
157 }
158 
160  std::string* bytemap) {
161  Regexp* re = Regexp::Parse(pattern, flags, NULL);
162  EXPECT_TRUE(re != NULL);
163 
164  {
165  Prog* prog = re->CompileToProg(0);
166  EXPECT_TRUE(prog != NULL);
167  *bytemap = prog->DumpByteMap();
168  delete prog;
169  }
170 
171  {
172  Prog* prog = re->CompileToReverseProg(0);
173  EXPECT_TRUE(prog != NULL);
174  EXPECT_EQ(*bytemap, prog->DumpByteMap());
175  delete prog;
176  }
177 
178  re->Decref();
179 }
180 
181 TEST(TestCompile, Latin1Ranges) {
182  // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
183 
184  std::string bytemap;
185 
186  DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
187  EXPECT_EQ("[00-09] -> 0\n"
188  "[0a-0a] -> 1\n"
189  "[0b-ff] -> 0\n",
190  bytemap);
191 }
192 
193 TEST(TestCompile, OtherByteMapTests) {
194  std::string bytemap;
195 
196  // Test that "absent" ranges are mapped to the same byte class.
197  DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
198  EXPECT_EQ("[00-2f] -> 0\n"
199  "[30-39] -> 1\n"
200  "[3a-40] -> 0\n"
201  "[41-46] -> 1\n"
202  "[47-60] -> 0\n"
203  "[61-66] -> 1\n"
204  "[67-ff] -> 0\n",
205  bytemap);
206 
207  // Test the byte classes for \b.
208  DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
209  EXPECT_EQ("[00-2f] -> 0\n"
210  "[30-39] -> 1\n"
211  "[3a-40] -> 0\n"
212  "[41-5a] -> 1\n"
213  "[5b-5e] -> 0\n"
214  "[5f-5f] -> 1\n"
215  "[60-60] -> 0\n"
216  "[61-7a] -> 1\n"
217  "[7b-ff] -> 0\n",
218  bytemap);
219 
220  // Bug in the ASCII case-folding optimization created too many byte classes.
221  DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
222  EXPECT_EQ("[00-5e] -> 0\n"
223  "[5f-5f] -> 1\n"
224  "[60-ff] -> 0\n",
225  bytemap);
226 }
227 
228 TEST(TestCompile, UTF8Ranges) {
229  // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
230  // Once, erroneously split between 0x3f and 0x40 because it is
231  // a 6-bit boundary.
232 
233  std::string bytemap;
234 
235  DumpByteMap(".", Regexp::PerlX, &bytemap);
236  EXPECT_EQ("[00-09] -> 0\n"
237  "[0a-0a] -> 1\n"
238  "[0b-7f] -> 0\n"
239  "[80-bf] -> 2\n"
240  "[c0-c1] -> 1\n"
241  "[c2-df] -> 3\n"
242  "[e0-ef] -> 4\n"
243  "[f0-f4] -> 5\n"
244  "[f5-ff] -> 1\n",
245  bytemap);
246 }
247 
248 TEST(TestCompile, InsufficientMemory) {
249  Regexp* re = Regexp::Parse(
250  "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
251  Regexp::LikePerl, NULL);
252  EXPECT_TRUE(re != NULL);
253  Prog* prog = re->CompileToProg(850);
254  // If the memory budget has been exhausted, compilation should fail
255  // and return NULL instead of trying to do anything with NoMatch().
256  EXPECT_TRUE(prog == NULL);
257  re->Decref();
258 }
259 
262  Regexp* re = Regexp::Parse(pattern, flags, NULL);
263  EXPECT_TRUE(re != NULL);
264 
265  if (forward != NULL) {
266  Prog* prog = re->CompileToProg(0);
267  EXPECT_TRUE(prog != NULL);
268  *forward = prog->Dump();
269  delete prog;
270  }
271 
272  if (reverse != NULL) {
273  Prog* prog = re->CompileToReverseProg(0);
274  EXPECT_TRUE(prog != NULL);
275  *reverse = prog->Dump();
276  delete prog;
277  }
278 
279  re->Decref();
280 }
281 
282 TEST(TestCompile, Bug26705922) {
283  // Bug in the compiler caused inefficient bytecode to be generated for Unicode
284  // groups: common suffixes were cached, but common prefixes were not factored.
285 
287 
288  Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
289  EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
290  "4. byte [90-90] 0 -> 5\n"
291  "5. byte [80-80] 0 -> 6\n"
292  "6+ byte [80-80] 0 -> 8\n"
293  "7. byte [90-90] 0 -> 8\n"
294  "8. match! 0\n",
295  forward);
296  EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
297  "4. byte [90-90] 0 -> 5\n"
298  "5. byte [80-80] 0 -> 6\n"
299  "6. byte [90-90] 0 -> 7\n"
300  "7. byte [f0-f0] 0 -> 8\n"
301  "8. match! 0\n",
302  reverse);
303 
304  Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
305  EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
306  "4. byte [f0-f0] 0 -> 8\n"
307  "5. byte [80-bf] 0 -> 6\n"
308  "6. byte [80-bf] 0 -> 7\n"
309  "7. match! 0\n"
310  "8. byte [90-90] 0 -> 5\n",
311  forward);
312  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
313  "4. byte [80-bf] 0 -> 5\n"
314  "5+ byte [e8-ef] 0 -> 7\n"
315  "6. byte [90-90] 0 -> 8\n"
316  "7. match! 0\n"
317  "8. byte [f0-f0] 0 -> 7\n",
318  reverse);
319 
320  Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
321  EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
322  "4+ byte [e0-ef] 0 -> 8\n"
323  "5. byte [f0-f4] 0 -> 9\n"
324  "6. byte [80-bf] 0 -> 7\n"
325  "7. match! 0\n"
326  "8. byte [80-bf] 0 -> 6\n"
327  "9. byte [80-bf] 0 -> 8\n",
328  forward);
329  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
330  "4+ byte [c2-df] 0 -> 6\n"
331  "5. byte [80-bf] 0 -> 7\n"
332  "6. match! 0\n"
333  "7+ byte [e0-ef] 0 -> 6\n"
334  "8. byte [80-bf] 0 -> 9\n"
335  "9. byte [f0-f4] 0 -> 6\n",
336  reverse);
337 }
338 
339 TEST(TestCompile, Bug35237384) {
340  // Bug in the compiler caused inefficient bytecode to be generated for
341  // nested nullable subexpressions.
342 
344 
345  Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
346  EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
347  "4. nop -> 5\n"
348  "5+ byte [61-61] 1 -> 5\n"
349  "6. nop -> 7\n"
350  "7+ byte [61-61] 1 -> 7\n"
351  "8. match! 0\n",
352  forward);
353 
354  Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
355  EXPECT_EQ("3+ nop -> 28\n"
356  "4. nop -> 30\n"
357  "5+ byte [61-61] 1 -> 5\n"
358  "6. nop -> 32\n"
359  "7+ byte [61-61] 1 -> 7\n"
360  "8. nop -> 26\n"
361  "9+ byte [61-61] 1 -> 9\n"
362  "10. nop -> 20\n"
363  "11+ byte [62-62] 1 -> 11\n"
364  "12. nop -> 20\n"
365  "13+ byte [62-62] 1 -> 13\n"
366  "14. nop -> 26\n"
367  "15+ byte [62-62] 1 -> 15\n"
368  "16. nop -> 32\n"
369  "17+ nop -> 9\n"
370  "18. nop -> 11\n"
371  "19. match! 0\n"
372  "20+ nop -> 17\n"
373  "21. nop -> 19\n"
374  "22+ nop -> 7\n"
375  "23. nop -> 13\n"
376  "24+ nop -> 17\n"
377  "25. nop -> 19\n"
378  "26+ nop -> 22\n"
379  "27. nop -> 24\n"
380  "28+ nop -> 5\n"
381  "29. nop -> 15\n"
382  "30+ nop -> 22\n"
383  "31. nop -> 24\n"
384  "32+ nop -> 28\n"
385  "33. nop -> 30\n",
386  forward);
387 
388  Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
389  EXPECT_EQ("3+ nop -> 36\n"
390  "4+ nop -> 31\n"
391  "5. nop -> 33\n"
392  "6+ byte [00-09] 0 -> 8\n"
393  "7. byte [0b-ff] 0 -> 8\n"
394  "8+ nop -> 6\n"
395  "9+ nop -> 29\n"
396  "10. nop -> 28\n"
397  "11+ byte [00-09] 0 -> 13\n"
398  "12. byte [0b-ff] 0 -> 13\n"
399  "13+ nop -> 11\n"
400  "14+ nop -> 26\n"
401  "15. nop -> 28\n"
402  "16+ byte [00-09] 0 -> 18\n"
403  "17. byte [0b-ff] 0 -> 18\n"
404  "18+ nop -> 16\n"
405  "19+ nop -> 36\n"
406  "20. nop -> 33\n"
407  "21+ byte [00-09] 0 -> 23\n"
408  "22. byte [0b-ff] 0 -> 23\n"
409  "23+ nop -> 21\n"
410  "24+ nop -> 31\n"
411  "25. nop -> 33\n"
412  "26+ nop -> 28\n"
413  "27. byte [53-53] 0 -> 11\n"
414  "28. match! 0\n"
415  "29+ nop -> 28\n"
416  "30. byte [53-53] 0 -> 6\n"
417  "31+ nop -> 33\n"
418  "32. byte [53-53] 0 -> 21\n"
419  "33+ nop -> 29\n"
420  "34+ nop -> 26\n"
421  "35. nop -> 28\n"
422  "36+ nop -> 33\n"
423  "37. byte [53-53] 0 -> 16\n",
424  forward);
425 }
426 
427 } // namespace re2
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
re2::Regexp::Decref
void Decref()
Definition: bloaty/third_party/re2/re2/regexp.cc:115
check_tracer_sanity.pattern
pattern
Definition: check_tracer_sanity.py:25
re2::Dump
static void Dump(StringPiece pattern, Regexp::ParseFlags flags, std::string *forward, std::string *reverse)
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:242
re2::Regexp
Definition: bloaty/third_party/re2/re2/regexp.h:274
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
absl::FormatConversionChar::s
@ s
re2::Regexp::CompileToProg
Prog * CompileToProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1220
re2::Regexp::NeverCapture
@ NeverCapture
Definition: bloaty/third_party/re2/re2/regexp.h:309
re2::Regexp::Latin1
@ Latin1
Definition: bloaty/third_party/re2/re2/regexp.h:289
EXPECT_EQ
#define EXPECT_EQ(a, b)
Definition: iomgr/time_averaged_stats_test.cc:27
make_dist_html.reverse
reverse
Definition: make_dist_html.py:119
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Regexp::Parse
static Regexp * Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status)
Definition: bloaty/third_party/re2/re2/parse.cc:2200
re2::Regexp::ParseFlags
ParseFlags
Definition: bloaty/third_party/re2/re2/regexp.h:278
google::protobuf::ERROR
static const LogLevel ERROR
Definition: bloaty/third_party/protobuf/src/google/protobuf/testing/googletest.h:70
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::DumpByteMap
static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, std::string *bytemap)
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:145
re2::Regexp::CompileToReverseProg
Prog * CompileToReverseProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1224
tests
Definition: src/python/grpcio_tests/tests/__init__.py:1
absl::flags_internal
Definition: abseil-cpp/absl/flags/commandlineflag.h:40
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
absl::str_format_internal::LengthMod::t
@ t
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1973
re2::Test
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:22
re2::Test::code
const char * code
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:24
EXPECT_TRUE
#define EXPECT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1967
re2::Test::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:23
absl::forward
constexpr T && forward(absl::remove_reference_t< T > &t) noexcept
Definition: abseil-cpp/absl/utility/utility.h:230
re2::TEST
TEST(TestCharClassBuilder, Adds)
Definition: bloaty/third_party/re2/re2/testing/charclass_test.cc:198
re2::Regexp::PerlX
@ PerlX
Definition: bloaty/third_party/re2/re2/regexp.h:293
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
Test
Definition: hpack_parser_test.cc:43
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Regexp::LikePerl
@ LikePerl
Definition: bloaty/third_party/re2/re2/regexp.h:312


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:00