bloaty/third_party/re2/re2/testing/compile_test.cc
Go to the documentation of this file.
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13 
14 namespace re2 {
15 
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21 
22 struct Test {
23  const char* regexp;
24  const char* code;
25 };
26 
27 static Test tests[] = {
28  { "a",
29  "3. byte [61-61] 0 -> 4\n"
30  "4. match! 0\n" },
31  { "ab",
32  "3. byte [61-61] 0 -> 4\n"
33  "4. byte [62-62] 0 -> 5\n"
34  "5. match! 0\n" },
35  { "a|c",
36  "3+ byte [61-61] 0 -> 5\n"
37  "4. byte [63-63] 0 -> 5\n"
38  "5. match! 0\n" },
39  { "a|b",
40  "3. byte [61-62] 0 -> 4\n"
41  "4. match! 0\n" },
42  { "[ab]",
43  "3. byte [61-62] 0 -> 4\n"
44  "4. match! 0\n" },
45  { "a+",
46  "3. byte [61-61] 0 -> 4\n"
47  "4+ nop -> 3\n"
48  "5. match! 0\n" },
49  { "a+?",
50  "3. byte [61-61] 0 -> 4\n"
51  "4+ match! 0\n"
52  "5. nop -> 3\n" },
53  { "a*",
54  "3+ byte [61-61] 1 -> 3\n"
55  "4. match! 0\n" },
56  { "a*?",
57  "3+ match! 0\n"
58  "4. byte [61-61] 0 -> 3\n" },
59  { "a?",
60  "3+ byte [61-61] 1 -> 5\n"
61  "4. nop -> 5\n"
62  "5. match! 0\n" },
63  { "a??",
64  "3+ nop -> 5\n"
65  "4. byte [61-61] 0 -> 5\n"
66  "5. match! 0\n" },
67  { "a{4}",
68  "3. byte [61-61] 0 -> 4\n"
69  "4. byte [61-61] 0 -> 5\n"
70  "5. byte [61-61] 0 -> 6\n"
71  "6. byte [61-61] 0 -> 7\n"
72  "7. match! 0\n" },
73  { "(a)",
74  "3. capture 2 -> 4\n"
75  "4. byte [61-61] 0 -> 5\n"
76  "5. capture 3 -> 6\n"
77  "6. match! 0\n" },
78  { "(?:a)",
79  "3. byte [61-61] 0 -> 4\n"
80  "4. match! 0\n" },
81  { "",
82  "3. match! 0\n" },
83  { ".",
84  "3+ byte [00-09] 0 -> 5\n"
85  "4. byte [0b-ff] 0 -> 5\n"
86  "5. match! 0\n" },
87  { "[^ab]",
88  "3+ byte [00-09] 0 -> 6\n"
89  "4+ byte [0b-60] 0 -> 6\n"
90  "5. byte [63-ff] 0 -> 6\n"
91  "6. match! 0\n" },
92  { "[Aa]",
93  "3. byte/i [61-61] 0 -> 4\n"
94  "4. match! 0\n" },
95  { "\\C+",
96  "3. byte [00-ff] 0 -> 4\n"
97  "4+ altmatch -> 5 | 6\n"
98  "5+ nop -> 3\n"
99  "6. match! 0\n" },
100  { "\\C*",
101  "3+ altmatch -> 4 | 5\n"
102  "4+ byte [00-ff] 1 -> 3\n"
103  "5. match! 0\n" },
104  { "\\C?",
105  "3+ byte [00-ff] 1 -> 5\n"
106  "4. nop -> 5\n"
107  "5. match! 0\n" },
108  // Issue 20992936
109  { "[[-`]",
110  "3. byte [5b-60] 0 -> 4\n"
111  "4. match! 0\n" },
112 };
113 
114 TEST(TestRegexpCompileToProg, Simple) {
115  int failed = 0;
116  for (size_t i = 0; i < arraysize(tests); i++) {
117  const re2::Test& t = tests[i];
118  Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
119  if (re == NULL) {
120  LOG(ERROR) << "Cannot parse: " << t.regexp;
121  failed++;
122  continue;
123  }
124  Prog* prog = re->CompileToProg(0);
125  if (prog == NULL) {
126  LOG(ERROR) << "Cannot compile: " << t.regexp;
127  re->Decref();
128  failed++;
129  continue;
130  }
131  ASSERT_TRUE(re->CompileToProg(1) == NULL);
132  std::string s = prog->Dump();
133  if (s != t.code) {
134  LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
135  LOG(ERROR) << "Want:\n" << t.code;
136  LOG(ERROR) << "Got:\n" << s;
137  failed++;
138  }
139  delete prog;
140  re->Decref();
141  }
142  EXPECT_EQ(failed, 0);
143 }
144 
146  std::string* bytemap) {
147  Regexp* re = Regexp::Parse(pattern, flags, NULL);
148  EXPECT_TRUE(re != NULL);
149 
150  Prog* prog = re->CompileToProg(0);
151  EXPECT_TRUE(prog != NULL);
152  *bytemap = prog->DumpByteMap();
153  delete prog;
154 
155  re->Decref();
156 }
157 
158 TEST(TestCompile, Latin1Ranges) {
159  // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
160 
161  std::string bytemap;
162 
163  DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
164  EXPECT_EQ("[00-09] -> 0\n"
165  "[0a-0a] -> 1\n"
166  "[0b-ff] -> 0\n",
167  bytemap);
168 }
169 
170 TEST(TestCompile, OtherByteMapTests) {
171  std::string bytemap;
172 
173  // Test that "absent" ranges are mapped to the same byte class.
174  DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
175  EXPECT_EQ("[00-2f] -> 0\n"
176  "[30-39] -> 1\n"
177  "[3a-40] -> 0\n"
178  "[41-46] -> 1\n"
179  "[47-60] -> 0\n"
180  "[61-66] -> 1\n"
181  "[67-ff] -> 0\n",
182  bytemap);
183 
184  // Test the byte classes for \b.
185  DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
186  EXPECT_EQ("[00-2f] -> 0\n"
187  "[30-39] -> 1\n"
188  "[3a-40] -> 0\n"
189  "[41-5a] -> 1\n"
190  "[5b-5e] -> 0\n"
191  "[5f-5f] -> 1\n"
192  "[60-60] -> 0\n"
193  "[61-7a] -> 1\n"
194  "[7b-ff] -> 0\n",
195  bytemap);
196 
197  // Bug in the ASCII case-folding optimization created too many byte classes.
198  DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
199  EXPECT_EQ("[00-5e] -> 0\n"
200  "[5f-5f] -> 1\n"
201  "[60-ff] -> 0\n",
202  bytemap);
203 }
204 
205 TEST(TestCompile, UTF8Ranges) {
206  // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
207  // Once, erroneously split between 0x3f and 0x40 because it is
208  // a 6-bit boundary.
209 
210  std::string bytemap;
211 
212  DumpByteMap(".", Regexp::PerlX, &bytemap);
213  EXPECT_EQ("[00-09] -> 0\n"
214  "[0a-0a] -> 1\n"
215  "[0b-7f] -> 0\n"
216  "[80-8f] -> 2\n"
217  "[90-9f] -> 3\n"
218  "[a0-bf] -> 4\n"
219  "[c0-c1] -> 1\n"
220  "[c2-df] -> 5\n"
221  "[e0-e0] -> 6\n"
222  "[e1-ef] -> 7\n"
223  "[f0-f0] -> 8\n"
224  "[f1-f3] -> 9\n"
225  "[f4-f4] -> 10\n"
226  "[f5-ff] -> 1\n",
227  bytemap);
228 }
229 
230 TEST(TestCompile, InsufficientMemory) {
231  Regexp* re = Regexp::Parse(
232  "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
233  Regexp::LikePerl, NULL);
234  EXPECT_TRUE(re != NULL);
235  Prog* prog = re->CompileToProg(920);
236  // If the memory budget has been exhausted, compilation should fail
237  // and return NULL instead of trying to do anything with NoMatch().
238  EXPECT_TRUE(prog == NULL);
239  re->Decref();
240 }
241 
244  Regexp* re = Regexp::Parse(pattern, flags, NULL);
245  EXPECT_TRUE(re != NULL);
246 
247  if (forward != NULL) {
248  Prog* prog = re->CompileToProg(0);
249  EXPECT_TRUE(prog != NULL);
250  *forward = prog->Dump();
251  delete prog;
252  }
253 
254  if (reverse != NULL) {
255  Prog* prog = re->CompileToReverseProg(0);
256  EXPECT_TRUE(prog != NULL);
257  *reverse = prog->Dump();
258  delete prog;
259  }
260 
261  re->Decref();
262 }
263 
264 TEST(TestCompile, Bug26705922) {
265  // Bug in the compiler caused inefficient bytecode to be generated for Unicode
266  // groups: common suffixes were cached, but common prefixes were not factored.
267 
269 
270  Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
271  EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
272  "4. byte [90-90] 0 -> 5\n"
273  "5. byte [80-80] 0 -> 6\n"
274  "6+ byte [80-80] 0 -> 8\n"
275  "7. byte [90-90] 0 -> 8\n"
276  "8. match! 0\n",
277  forward);
278  EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
279  "4. byte [90-90] 0 -> 5\n"
280  "5. byte [80-80] 0 -> 6\n"
281  "6. byte [90-90] 0 -> 7\n"
282  "7. byte [f0-f0] 0 -> 8\n"
283  "8. match! 0\n",
284  reverse);
285 
286  Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
287  EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
288  "4. byte [f0-f0] 0 -> 8\n"
289  "5. byte [80-bf] 0 -> 6\n"
290  "6. byte [80-bf] 0 -> 7\n"
291  "7. match! 0\n"
292  "8. byte [90-90] 0 -> 5\n",
293  forward);
294  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
295  "4. byte [80-bf] 0 -> 5\n"
296  "5+ byte [e8-ef] 0 -> 7\n"
297  "6. byte [90-90] 0 -> 8\n"
298  "7. match! 0\n"
299  "8. byte [f0-f0] 0 -> 7\n",
300  reverse);
301 
302  Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
303  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
304  "4+ byte [c2-df] 0 -> 7\n"
305  "5+ byte [a0-bf] 1 -> 8\n"
306  "6. byte [80-bf] 0 -> 9\n"
307  "7. match! 0\n"
308  "8. byte [e0-e0] 0 -> 7\n"
309  "9+ byte [e1-ef] 0 -> 7\n"
310  "10+ byte [90-bf] 1 -> 13\n"
311  "11+ byte [80-bf] 1 -> 14\n"
312  "12. byte [80-8f] 0 -> 15\n"
313  "13. byte [f0-f0] 0 -> 7\n"
314  "14. byte [f1-f3] 0 -> 7\n"
315  "15. byte [f4-f4] 0 -> 7\n",
316  reverse);
317 }
318 
319 TEST(TestCompile, Bug35237384) {
320  // Bug in the compiler caused inefficient bytecode to be generated for
321  // nested nullable subexpressions.
322 
324 
325  Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
326  EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
327  "4. nop -> 5\n"
328  "5+ byte [61-61] 1 -> 5\n"
329  "6. nop -> 7\n"
330  "7+ byte [61-61] 1 -> 7\n"
331  "8. match! 0\n",
332  forward);
333 
334  Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
335  EXPECT_EQ("3+ nop -> 6\n"
336  "4+ nop -> 8\n"
337  "5. nop -> 21\n"
338  "6+ byte [61-61] 1 -> 6\n"
339  "7. nop -> 3\n"
340  "8+ byte [62-62] 1 -> 8\n"
341  "9. nop -> 3\n"
342  "10+ byte [61-61] 1 -> 10\n"
343  "11. nop -> 21\n"
344  "12+ byte [62-62] 1 -> 12\n"
345  "13. nop -> 21\n"
346  "14+ byte [61-61] 1 -> 14\n"
347  "15. nop -> 18\n"
348  "16+ byte [62-62] 1 -> 16\n"
349  "17. nop -> 18\n"
350  "18+ nop -> 14\n"
351  "19+ nop -> 16\n"
352  "20. match! 0\n"
353  "21+ nop -> 10\n"
354  "22+ nop -> 12\n"
355  "23. nop -> 18\n",
356  forward);
357 
358  Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359  EXPECT_EQ("3+ nop -> 36\n"
360  "4+ nop -> 31\n"
361  "5. nop -> 33\n"
362  "6+ byte [00-09] 0 -> 8\n"
363  "7. byte [0b-ff] 0 -> 8\n"
364  "8+ nop -> 6\n"
365  "9+ nop -> 29\n"
366  "10. nop -> 28\n"
367  "11+ byte [00-09] 0 -> 13\n"
368  "12. byte [0b-ff] 0 -> 13\n"
369  "13+ nop -> 11\n"
370  "14+ nop -> 26\n"
371  "15. nop -> 28\n"
372  "16+ byte [00-09] 0 -> 18\n"
373  "17. byte [0b-ff] 0 -> 18\n"
374  "18+ nop -> 16\n"
375  "19+ nop -> 36\n"
376  "20. nop -> 33\n"
377  "21+ byte [00-09] 0 -> 23\n"
378  "22. byte [0b-ff] 0 -> 23\n"
379  "23+ nop -> 21\n"
380  "24+ nop -> 31\n"
381  "25. nop -> 33\n"
382  "26+ nop -> 28\n"
383  "27. byte [53-53] 0 -> 11\n"
384  "28. match! 0\n"
385  "29+ nop -> 28\n"
386  "30. byte [53-53] 0 -> 6\n"
387  "31+ nop -> 33\n"
388  "32. byte [53-53] 0 -> 21\n"
389  "33+ nop -> 29\n"
390  "34+ nop -> 26\n"
391  "35. nop -> 28\n"
392  "36+ nop -> 33\n"
393  "37. byte [53-53] 0 -> 16\n",
394  forward);
395 }
396 
397 } // namespace re2
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
re2::Regexp::Decref
void Decref()
Definition: bloaty/third_party/re2/re2/regexp.cc:115
check_tracer_sanity.pattern
pattern
Definition: check_tracer_sanity.py:25
re2::Dump
static void Dump(StringPiece pattern, Regexp::ParseFlags flags, std::string *forward, std::string *reverse)
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:242
re2::Regexp
Definition: bloaty/third_party/re2/re2/regexp.h:274
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::Regexp::CompileToProg
Prog * CompileToProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1220
re2::Regexp::NeverCapture
@ NeverCapture
Definition: bloaty/third_party/re2/re2/regexp.h:309
re2::Regexp::Latin1
@ Latin1
Definition: bloaty/third_party/re2/re2/regexp.h:289
EXPECT_EQ
#define EXPECT_EQ(a, b)
Definition: iomgr/time_averaged_stats_test.cc:27
make_dist_html.reverse
reverse
Definition: make_dist_html.py:119
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Regexp::Parse
static Regexp * Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status)
Definition: bloaty/third_party/re2/re2/parse.cc:2200
re2::Regexp::ParseFlags
ParseFlags
Definition: bloaty/third_party/re2/re2/regexp.h:278
google::protobuf::ERROR
static const LogLevel ERROR
Definition: bloaty/third_party/protobuf/src/google/protobuf/testing/googletest.h:70
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::DumpByteMap
static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, std::string *bytemap)
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:145
re2::Regexp::CompileToReverseProg
Prog * CompileToReverseProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1224
tests
Definition: src/python/grpcio_tests/tests/__init__.py:1
absl::flags_internal
Definition: abseil-cpp/absl/flags/commandlineflag.h:40
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1973
re2::Test
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:22
re2::Test::code
const char * code
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:24
EXPECT_TRUE
#define EXPECT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1967
re2::Test::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:23
absl::forward
constexpr T && forward(absl::remove_reference_t< T > &t) noexcept
Definition: abseil-cpp/absl/utility/utility.h:230
re2::TEST
TEST(TestCharClassBuilder, Adds)
Definition: bloaty/third_party/re2/re2/testing/charclass_test.cc:198
re2::Regexp::PerlX
@ PerlX
Definition: bloaty/third_party/re2/re2/regexp.h:293
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Regexp::LikePerl
@ LikePerl
Definition: bloaty/third_party/re2/re2/regexp.h:312


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:00