re2/re2/testing/simplify_test.cc
Go to the documentation of this file.
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test simplify.cc.
6 
7 #include <string.h>
8 #include <string>
9 
10 #include "util/test.h"
11 #include "util/logging.h"
12 #include "re2/regexp.h"
13 
14 namespace re2 {
15 
16 struct Test {
17  const char* regexp;
18  const char* simplified;
19 };
20 
21 static Test tests[] = {
22  // Already-simple constructs
23  { "a", "a" },
24  { "ab", "ab" },
25  { "a|b", "[a-b]" },
26  { "ab|cd", "ab|cd" },
27  { "(ab)*", "(ab)*" },
28  { "(ab)+", "(ab)+" },
29  { "(ab)?", "(ab)?" },
30  { ".", "." },
31  { "^", "^" },
32  { "$", "$" },
33  { "[ac]", "[ac]" },
34  { "[^ac]", "[^ac]" },
35 
36  // Posix character classes
37  { "[[:alnum:]]", "[0-9A-Za-z]" },
38  { "[[:alpha:]]", "[A-Za-z]" },
39  { "[[:blank:]]", "[\\t ]" },
40  { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
41  { "[[:digit:]]", "[0-9]" },
42  { "[[:graph:]]", "[!-~]" },
43  { "[[:lower:]]", "[a-z]" },
44  { "[[:print:]]", "[ -~]" },
45  { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
46  { "[[:space:]]" , "[\\t-\\r ]" },
47  { "[[:upper:]]", "[A-Z]" },
48  { "[[:xdigit:]]", "[0-9A-Fa-f]" },
49 
50  // Perl character classes
51  { "\\d", "[0-9]" },
52  { "\\s", "[\\t-\\n\\f-\\r ]" },
53  { "\\w", "[0-9A-Z_a-z]" },
54  { "\\D", "[^0-9]" },
55  { "\\S", "[^\\t-\\n\\f-\\r ]" },
56  { "\\W", "[^0-9A-Z_a-z]" },
57  { "[\\d]", "[0-9]" },
58  { "[\\s]", "[\\t-\\n\\f-\\r ]" },
59  { "[\\w]", "[0-9A-Z_a-z]" },
60  { "[\\D]", "[^0-9]" },
61  { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
62  { "[\\W]", "[^0-9A-Z_a-z]" },
63 
64  // Posix repetitions
65  { "a{1}", "a" },
66  { "a{2}", "aa" },
67  { "a{5}", "aaaaa" },
68  { "a{0,1}", "a?" },
69  // The next three are illegible because Simplify inserts (?:)
70  // parens instead of () parens to avoid creating extra
71  // captured subexpressions. The comments show a version fewer parens.
72  { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
73  { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
74  { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
75  { "a{0,2}", "(?:aa?)?" }, // (aa?)?
76  { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
77  { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
78  { "a{0,}", "a*" },
79  { "a{1,}", "a+" },
80  { "a{2,}", "aa+" },
81  { "a{5,}", "aaaaa+" },
82 
83  // Test that operators simplify their arguments.
84  // (Simplify used to not simplify arguments to a {} repeat.)
85  { "(?:a{1,}){1,}", "a+" },
86  { "(a{1,}b{1,})", "(a+b+)" },
87  { "a{1,}|b{1,}", "a+|b+" },
88  { "(?:a{1,})*", "(?:a+)*" },
89  { "(?:a{1,})+", "a+" },
90  { "(?:a{1,})?", "(?:a+)?" },
91  { "a{0}", "" },
92 
93  // Character class simplification
94  { "[ab]", "[a-b]" },
95  { "[a-za-za-z]", "[a-z]" },
96  { "[A-Za-zA-Za-z]", "[A-Za-z]" },
97  { "[ABCDEFGH]", "[A-H]" },
98  { "[AB-CD-EF-GH]", "[A-H]" },
99  { "[W-ZP-XE-R]", "[E-Z]" },
100  { "[a-ee-gg-m]", "[a-m]" },
101  { "[a-ea-ha-m]", "[a-m]" },
102  { "[a-ma-ha-e]", "[a-m]" },
103  { "[a-zA-Z0-9 -~]", "[ -~]" },
104 
105  // Empty character classes
106  { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
107 
108  // Full character classes
109  { "[[:cntrl:][:^cntrl:]]", "." },
110 
111  // Unicode case folding.
112  { "(?i)A", "[Aa]" },
113  { "(?i)a", "[Aa]" },
114  { "(?i)K", "[Kk\\x{212a}]" },
115  { "(?i)k", "[Kk\\x{212a}]" },
116  { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
117  { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
118  { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
119  { "(?i)[\\x00-\\x{10ffff}]", "." },
120 
121  // Empty string as a regular expression.
122  // Empty string must be preserved inside parens in order
123  // to make submatches work right, so these are less
124  // interesting than they used to be. ToString inserts
125  // explicit (?:) in place of non-parenthesized empty strings,
126  // to make them easier to spot for other parsers.
127  { "(a|b|)", "([a-b]|(?:))" },
128  { "(|)", "((?:)|(?:))" },
129  { "a()", "a()" },
130  { "(()|())", "(()|())" },
131  { "(a|)", "(a|(?:))" },
132  { "ab()cd()", "ab()cd()" },
133  { "()", "()" },
134  { "()*", "()*" },
135  { "()+", "()+" },
136  { "()?" , "()?" },
137  { "(){0}", "" },
138  { "(){1}", "()" },
139  { "(){1,}", "()+" },
140  { "(){0,2}", "(?:()()?)?" },
141 
142  // Test that coalescing occurs and that the resulting repeats are simplified.
143  // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
144  { "a*a*", "a*" },
145  { "a*a+", "a+" },
146  { "a*a?", "a*" },
147  { "a*a{2}", "aa+" },
148  { "a*a{2,}", "aa+" },
149  { "a*a{2,3}", "aa+" },
150  { "a+a*", "a+" },
151  { "a+a+", "aa+" },
152  { "a+a?", "a+" },
153  { "a+a{2}", "aaa+" },
154  { "a+a{2,}", "aaa+" },
155  { "a+a{2,3}", "aaa+" },
156  { "a?a*", "a*" },
157  { "a?a+", "a+" },
158  { "a?a?", "(?:aa?)?" },
159  { "a?a{2}", "aaa?" },
160  { "a?a{2,}", "aa+" },
161  { "a?a{2,3}", "aa(?:aa?)?" },
162  { "a{2}a*", "aa+" },
163  { "a{2}a+", "aaa+" },
164  { "a{2}a?", "aaa?" },
165  { "a{2}a{2}", "aaaa" },
166  { "a{2}a{2,}", "aaaa+" },
167  { "a{2}a{2,3}", "aaaaa?" },
168  { "a{2,}a*", "aa+" },
169  { "a{2,}a+", "aaa+" },
170  { "a{2,}a?", "aa+" },
171  { "a{2,}a{2}", "aaaa+" },
172  { "a{2,}a{2,}", "aaaa+" },
173  { "a{2,}a{2,3}", "aaaa+" },
174  { "a{2,3}a*", "aa+" },
175  { "a{2,3}a+", "aaa+" },
176  { "a{2,3}a?", "aa(?:aa?)?" },
177  { "a{2,3}a{2}", "aaaaa?" },
178  { "a{2,3}a{2,}", "aaaa+" },
179  { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
180  // With a char class, any char and any byte:
181  { "\\d*\\d*", "[0-9]*" },
182  { ".*.*", ".*" },
183  { "\\C*\\C*", "\\C*" },
184  // FoldCase works, but must be consistent:
185  { "(?i)A*a*", "[Aa]*" },
186  { "(?i)a+A+", "[Aa][Aa]+" },
187  { "(?i)A*(?-i)a*", "[Aa]*a*" },
188  { "(?i)a+(?-i)A+", "[Aa]+A+" },
189  // NonGreedy works, but must be consistent:
190  { "a*?a*?", "a*?" },
191  { "a+?a+?", "aa+?" },
192  { "a*?a*", "a*?a*" },
193  { "a+a+?", "a+a+?" },
194  // The second element is the literal, char class, any char or any byte:
195  { "a*a", "a+" },
196  { "\\d*\\d", "[0-9]+" },
197  { ".*.", ".+" },
198  { "\\C*\\C", "\\C+" },
199  // FoldCase works, but must be consistent:
200  { "(?i)A*a", "[Aa]+" },
201  { "(?i)a+A", "[Aa][Aa]+" },
202  { "(?i)A*(?-i)a", "[Aa]*a" },
203  { "(?i)a+(?-i)A", "[Aa]+A" },
204  // The second element is a literal string that begins with the literal:
205  { "a*aa", "aa+" },
206  { "a*aab", "aa+b" },
207  // FoldCase works, but must be consistent:
208  { "(?i)a*aa", "[Aa][Aa]+" },
209  { "(?i)a*aab", "[Aa][Aa]+[Bb]" },
210  { "(?i)a*(?-i)aa", "[Aa]*aa" },
211  { "(?i)a*(?-i)aab", "[Aa]*aab" },
212  // Negative tests with mismatching ops:
213  { "a*b*", "a*b*" },
214  { "\\d*\\D*", "[0-9]*[^0-9]*" },
215  { "a+b", "a+b" },
216  { "\\d+\\D", "[0-9]+[^0-9]" },
217  { "a?bb", "a?bb" },
218  // Negative tests with capturing groups:
219  { "(a*)a*", "(a*)a*" },
220  { "a+(a)", "a+(a)" },
221  { "(a?)(aa)", "(a?)(aa)" },
222  // Just for fun:
223  { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
224 
225  // During coalescing, the child of the repeat changes, so we build a new
226  // repeat. The new repeat must have the min and max of the old repeat.
227  // Failure to copy them results in min=0 and max=0 -> empty match.
228  { "(?:a*aab){2}", "aa+baa+b" },
229 
230  // During coalescing, the child of the capture changes, so we build a new
231  // capture. The new capture must have the cap of the old capture.
232  // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
233  { "(a*aab)", "(aa+b)" },
234 
235  // Test squashing of **, ++, ?? et cetera.
236  { "(?:(?:a){0,}){0,}", "a*" },
237  { "(?:(?:a){1,}){1,}", "a+" },
238  { "(?:(?:a){0,1}){0,1}", "a?" },
239  { "(?:(?:a){0,}){1,}", "a*" },
240  { "(?:(?:a){0,}){0,1}", "a*" },
241  { "(?:(?:a){1,}){0,}", "a*" },
242  { "(?:(?:a){1,}){0,1}", "a*" },
243  { "(?:(?:a){0,1}){0,}", "a*" },
244  { "(?:(?:a){0,1}){1,}", "a*" },
245 };
246 
247 TEST(TestSimplify, SimpleRegexps) {
248  for (size_t i = 0; i < arraysize(tests); i++) {
249  RegexpStatus status;
250  VLOG(1) << "Testing " << tests[i].regexp;
251  Regexp* re = Regexp::Parse(tests[i].regexp,
253  ~Regexp::OneLine),
254  &status);
255  ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
256  Regexp* sre = re->Simplify();
257  ASSERT_TRUE(sre != NULL);
258 
259  // Check that already-simple regexps don't allocate new ones.
260  if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
261  ASSERT_TRUE(re == sre) << " " << tests[i].regexp
262  << " " << re->ToString() << " " << sre->ToString();
263  }
264 
265  EXPECT_EQ(tests[i].simplified, sre->ToString())
266  << " " << tests[i].regexp << " " << sre->Dump();
267 
268  re->Decref();
269  sre->Decref();
270  }
271 }
272 
273 } // namespace re2
VLOG
#define VLOG(x)
Definition: third_party/bloaty/third_party/protobuf/third_party/benchmark/src/log.h:69
string.h
status
absl::Status status
Definition: rls.cc:251
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::Regexp::OneLine
@ OneLine
Definition: bloaty/third_party/re2/re2/regexp.h:286
EXPECT_EQ
#define EXPECT_EQ(a, b)
Definition: iomgr/time_averaged_stats_test.cc:27
re2::Regexp::Parse
static Regexp * Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status)
Definition: bloaty/third_party/re2/re2/parse.cc:2200
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::Test::simplified
const char * simplified
Definition: bloaty/third_party/re2/re2/testing/simplify_test.cc:18
tests
Definition: src/python/grpcio_tests/tests/__init__.py:1
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1973
re2::Test
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:22
re2::Test::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/compile_test.cc:23
re2::Regexp::MatchNL
@ MatchNL
Definition: bloaty/third_party/re2/re2/regexp.h:285
re2::TEST
TEST(TestCharClassBuilder, Adds)
Definition: bloaty/third_party/re2/re2/testing/charclass_test.cc:198
Test
Definition: hpack_parser_test.cc:43
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Regexp::LikePerl
@ LikePerl
Definition: bloaty/third_party/re2/re2/regexp.h:312


grpc
Author(s):
autogenerated on Fri May 16 2025 03:00:13