bloaty/third_party/re2/re2/testing/dfa_test.cc
Go to the documentation of this file.
1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stdint.h>
6 #include <string>
7 #include <thread>
8 #include <vector>
9 
10 #include "util/test.h"
11 #include "util/logging.h"
12 #include "util/strutil.h"
13 #include "re2/prog.h"
14 #include "re2/re2.h"
15 #include "re2/regexp.h"
16 #include "re2/testing/regexp_generator.h"
17 #include "re2/testing/string_generator.h"
18 
19 static const bool UsingMallocCounter = false;
20 
21 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
22 DEFINE_int32(repeat, 2, "Repetition count.");
23 DEFINE_int32(threads, 4, "number of threads");
24 
25 namespace re2 {
26 
27 // Check that multithreaded access to DFA class works.
28 
29 // Helper function: builds entire DFA for prog.
30 static void DoBuild(Prog* prog) {
31  ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
32 }
33 
34 TEST(Multithreaded, BuildEntireDFA) {
35  // Create regexp with 2^FLAGS_size states in DFA.
36  std::string s = "a";
37  for (int i = 0; i < FLAGS_size; i++)
38  s += "[ab]";
39  s += "b";
40  Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
41  ASSERT_TRUE(re != NULL);
42 
43  // Check that single-threaded code works.
44  {
45  Prog* prog = re->CompileToProg(0);
46  ASSERT_TRUE(prog != NULL);
47 
49  t.join();
50 
51  delete prog;
52  }
53 
54  // Build the DFA simultaneously in a bunch of threads.
55  for (int i = 0; i < FLAGS_repeat; i++) {
56  Prog* prog = re->CompileToProg(0);
57  ASSERT_TRUE(prog != NULL);
58 
59  std::vector<std::thread> threads;
60  for (int j = 0; j < FLAGS_threads; j++)
61  threads.emplace_back(DoBuild, prog);
62  for (int j = 0; j < FLAGS_threads; j++)
63  threads[j].join();
64 
65  // One more compile, to make sure everything is okay.
66  prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
67  delete prog;
68  }
69 
70  re->Decref();
71 }
72 
73 // Check that DFA size requirements are followed.
74 // BuildEntireDFA will, like SearchDFA, stop building out
75 // the DFA once the memory limits are reached.
76 TEST(SingleThreaded, BuildEntireDFA) {
77  // Create regexp with 2^30 states in DFA.
78  Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
79  ASSERT_TRUE(re != NULL);
80 
81  for (int i = 17; i < 24; i++) {
82  int64_t limit = int64_t{1}<<i;
83  int64_t usage;
84  //int64_t progusage, dfamem;
85  {
87  Prog* prog = re->CompileToProg(limit);
88  ASSERT_TRUE(prog != NULL);
89  //progusage = m.HeapGrowth();
90  //dfamem = prog->dfa_mem();
91  prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
92  prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
93  usage = m.HeapGrowth();
94  delete prog;
95  }
96  if (UsingMallocCounter) {
97  //LOG(INFO) << "limit " << limit << ", "
98  // << "prog usage " << progusage << ", "
99  // << "DFA budget " << dfamem << ", "
100  // << "total " << usage;
101  // Tolerate +/- 10%.
102  ASSERT_GT(usage, limit*9/10);
103  ASSERT_LT(usage, limit*11/10);
104  }
105  }
106  re->Decref();
107 }
108 
109 // Generates and returns a string over binary alphabet {0,1} that contains
110 // all possible binary sequences of length n as subsequences. The obvious
111 // brute force method would generate a string of length n * 2^n, but this
112 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
113 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
114 // Such a string is useful for testing a DFA. If you have a DFA
115 // where distinct last n bytes implies distinct states, then running on a
116 // DeBruijn string causes the DFA to need to create a new state at every
117 // position in the input, never reusing any states until it gets to the
118 // end of the string. This is the worst possible case for DFA execution.
120  CHECK_LT(n, static_cast<int>(8*sizeof(int)));
121  CHECK_GT(n, 0);
122 
123  std::vector<bool> did(size_t{1}<<n);
124  for (int i = 0; i < 1<<n; i++)
125  did[i] = false;
126 
127  std::string s;
128  for (int i = 0; i < n-1; i++)
129  s.append("0");
130  int bits = 0;
131  int mask = (1<<n) - 1;
132  for (int i = 0; i < (1<<n); i++) {
133  bits <<= 1;
134  bits &= mask;
135  if (!did[bits|1]) {
136  bits |= 1;
137  s.append("1");
138  } else {
139  s.append("0");
140  }
141  CHECK(!did[bits]);
142  did[bits] = true;
143  }
144  return s;
145 }
146 
147 // Test that the DFA gets the right result even if it runs
148 // out of memory during a search. The regular expression
149 // 0[01]{n}$ matches a binary string of 0s and 1s only if
150 // the (n+1)th-to-last character is a 0. Matching this in
151 // a single forward pass (as done by the DFA) requires
152 // keeping one bit for each of the last n+1 characters
153 // (whether each was a 0), or 2^(n+1) possible states.
154 // If we run this regexp to search in a string that contains
155 // every possible n-character binary string as a substring,
156 // then it will have to run through at least 2^n states.
157 // States are big data structures -- certainly more than 1 byte --
158 // so if the DFA can search correctly while staying within a
159 // 2^n byte limit, it must be handling out-of-memory conditions
160 // gracefully.
161 TEST(SingleThreaded, SearchDFA) {
162  // The De Bruijn string is the worst case input for this regexp.
163  // By default, the DFA will notice that it is flushing its cache
164  // too frequently and will bail out early, so that RE2 can use the
165  // NFA implementation instead. (The DFA loses its speed advantage
166  // if it can't get a good cache hit rate.)
167  // Tell the DFA to trudge along instead.
169 
170  // Choice of n is mostly arbitrary, except that:
171  // * making n too big makes the test run for too long.
172  // * making n too small makes the DFA refuse to run,
173  // because it has so little memory compared to the program size.
174  // Empirically, n = 18 is a good compromise between the two.
175  const int n = 18;
176 
177  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
178  Regexp::LikePerl, NULL);
179  ASSERT_TRUE(re != NULL);
180 
181  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
182  // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
183  std::string no_match = DeBruijnString(n);
184  std::string match = no_match + "0";
185 
186  int64_t usage;
187  int64_t peak_usage;
188  {
190  Prog* prog = re->CompileToProg(1<<n);
191  ASSERT_TRUE(prog != NULL);
192  for (int i = 0; i < 10; i++) {
193  bool matched = false;
194  bool failed = false;
195  matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
196  Prog::kFirstMatch, NULL, &failed, NULL);
197  ASSERT_FALSE(failed);
198  ASSERT_TRUE(matched);
199  matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
200  Prog::kFirstMatch, NULL, &failed, NULL);
201  ASSERT_FALSE(failed);
202  ASSERT_FALSE(matched);
203  }
204  usage = m.HeapGrowth();
205  peak_usage = m.PeakHeapGrowth();
206  delete prog;
207  }
208  if (UsingMallocCounter) {
209  //LOG(INFO) << "usage " << usage << ", "
210  // << "peak usage " << peak_usage;
211  ASSERT_LT(usage, 1<<n);
212  ASSERT_LT(peak_usage, 1<<n);
213  }
214  re->Decref();
215 
216  // Reset to original behaviour.
218 }
219 
220 // Helper function: searches for match, which should match,
221 // and no_match, which should not.
222 static void DoSearch(Prog* prog, const StringPiece& match,
223  const StringPiece& no_match) {
224  for (int i = 0; i < 2; i++) {
225  bool matched = false;
226  bool failed = false;
227  matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
228  Prog::kFirstMatch, NULL, &failed, NULL);
229  ASSERT_FALSE(failed);
230  ASSERT_TRUE(matched);
231  matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
232  Prog::kFirstMatch, NULL, &failed, NULL);
233  ASSERT_FALSE(failed);
234  ASSERT_FALSE(matched);
235  }
236 }
237 
238 TEST(Multithreaded, SearchDFA) {
240 
241  // Same as single-threaded test above.
242  const int n = 18;
243  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
244  Regexp::LikePerl, NULL);
245  ASSERT_TRUE(re != NULL);
246  std::string no_match = DeBruijnString(n);
247  std::string match = no_match + "0";
248 
249  // Check that single-threaded code works.
250  {
251  Prog* prog = re->CompileToProg(1<<n);
252  ASSERT_TRUE(prog != NULL);
253 
254  std::thread t(DoSearch, prog, match, no_match);
255  t.join();
256 
257  delete prog;
258  }
259 
260  // Run the search simultaneously in a bunch of threads.
261  // Reuse same flags for Multithreaded.BuildDFA above.
262  for (int i = 0; i < FLAGS_repeat; i++) {
263  Prog* prog = re->CompileToProg(1<<n);
264  ASSERT_TRUE(prog != NULL);
265 
266  std::vector<std::thread> threads;
267  for (int j = 0; j < FLAGS_threads; j++)
268  threads.emplace_back(DoSearch, prog, match, no_match);
269  for (int j = 0; j < FLAGS_threads; j++)
270  threads[j].join();
271 
272  delete prog;
273  }
274 
275  re->Decref();
276 
277  // Reset to original behaviour.
279 }
280 
281 struct ReverseTest {
282  const char* regexp;
283  const char* text;
284  bool match;
285 };
286 
287 // Test that reverse DFA handles anchored/unanchored correctly.
288 // It's in the DFA interface but not used by RE2.
290  { "\\A(a|b)", "abc", true },
291  { "(a|b)\\z", "cba", true },
292  { "\\A(a|b)", "cba", false },
293  { "(a|b)\\z", "abc", false },
294 };
295 
296 TEST(DFA, ReverseMatch) {
297  int nfail = 0;
298  for (size_t i = 0; i < arraysize(reverse_tests); i++) {
299  const ReverseTest& t = reverse_tests[i];
300  Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
301  ASSERT_TRUE(re != NULL);
302  Prog* prog = re->CompileToReverseProg(0);
303  ASSERT_TRUE(prog != NULL);
304  bool failed = false;
305  bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
306  Prog::kFirstMatch, NULL, &failed, NULL);
307  if (matched != t.match) {
308  LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
309  nfail++;
310  }
311  delete prog;
312  re->Decref();
313  }
314  EXPECT_EQ(nfail, 0);
315 }
316 
317 struct CallbackTest {
318  const char* regexp;
319  const char* dump;
320 };
321 
322 // Test that DFA::BuildAllStates() builds the expected DFA states
323 // and issues the expected callbacks. These test cases reflect the
324 // very compact encoding of the callbacks, but that also makes them
325 // very difficult to understand, so let's work through "\\Aa\\z".
326 // There are three slots per DFA state because the bytemap has two
327 // equivalence classes and there is a third slot for kByteEndText:
328 // 0: all bytes that are not 'a'
329 // 1: the byte 'a'
330 // 2: kByteEndText
331 // -1 means that there is no transition from that DFA state to any
332 // other DFA state for that slot. The valid transitions are thus:
333 // state 0 --slot 1--> state 1
334 // state 1 --slot 2--> state 2
335 // The double brackets indicate that state 2 is a matching state.
336 // Putting it together, this means that the DFA must consume the
337 // byte 'a' and then hit end of text. Q.E.D.
339  { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
340  { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
341  { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
342  { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343  { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
344  { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
345  { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
346  { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
347  { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
348  { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
349 };
350 
351 TEST(DFA, Callback) {
352  int nfail = 0;
353  for (size_t i = 0; i < arraysize(callback_tests); i++) {
354  const CallbackTest& t = callback_tests[i];
355  Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
356  ASSERT_TRUE(re != NULL);
357  Prog* prog = re->CompileToProg(0);
358  ASSERT_TRUE(prog != NULL);
359  std::string dump;
360  prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
361  ASSERT_TRUE(next != NULL);
362  if (!dump.empty())
363  dump += " ";
364  dump += match ? "[[" : "[";
365  for (int b = 0; b < prog->bytemap_range() + 1; b++)
366  dump += StringPrintf("%d,", next[b]);
367  dump.pop_back();
368  dump += match ? "]]" : "]";
369  });
370  if (dump != t.dump) {
371  LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
372  LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
373  nfail++;
374  }
375  delete prog;
376  re->Decref();
377  }
378  EXPECT_EQ(nfail, 0);
379 }
380 
381 } // namespace re2
re2::DoSearch
static void DoSearch(Prog *prog, const StringPiece &match, const StringPiece &no_match)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:222
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
CHECK_GT
#define CHECK_GT(a, b)
Definition: bloaty/third_party/protobuf/third_party/benchmark/src/check.h:69
re2::Regexp::Decref
void Decref()
Definition: bloaty/third_party/re2/re2/regexp.cc:115
re2::reverse_tests
ReverseTest reverse_tests[]
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:289
match
unsigned char match[65280+2]
Definition: bloaty/third_party/zlib/examples/gun.c:165
re2::Regexp
Definition: bloaty/third_party/re2/re2/regexp.h:274
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::Regexp::CompileToProg
Prog * CompileToProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1220
re2::CallbackTest::dump
const char * dump
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:319
re2::ReverseTest::text
const char * text
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:283
threads
static uv_thread_t * threads
Definition: threadpool.c:38
EXPECT_EQ
#define EXPECT_EQ(a, b)
Definition: iomgr/time_averaged_stats_test.cc:27
UsingMallocCounter
static const bool UsingMallocCounter
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:19
re2::CallbackTest::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:318
ASSERT_LT
#define ASSERT_LT(val1, val2)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:2068
int64_t
signed __int64 int64_t
Definition: stdint-msvc2008.h:89
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::StringPrintf
std::string StringPrintf(const char *format,...)
Definition: bloaty/third_party/re2/util/strutil.cc:140
DEFINE_int32
DEFINE_int32(size, 8, "log2(number of DFA nodes)")
bits
OPENSSL_EXPORT ASN1_BIT_STRING * bits
Definition: x509v3.h:482
re2::Regexp::Parse
static Regexp * Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status)
Definition: bloaty/third_party/re2/re2/parse.cc:2200
CHECK
#define CHECK(x)
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/upb.c:8085
re2::ReverseTest::match
bool match
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:284
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
google::protobuf::ERROR
static const LogLevel ERROR
Definition: bloaty/third_party/protobuf/src/google/protobuf/testing/googletest.h:70
stdint.h
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::Regexp::CompileToReverseProg
Prog * CompileToReverseProg(int64_t max_mem)
Definition: bloaty/third_party/re2/re2/compile.cc:1224
re2::DFA
Definition: bloaty/third_party/re2/re2/dfa.cc:77
CHECK_LT
#define CHECK_LT(a, b)
Definition: bloaty/third_party/protobuf/third_party/benchmark/src/check.h:70
re2::DeBruijnString
static std::string DeBruijnString(int n)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:119
re2::DoBuild
static void DoBuild(Prog *prog)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:30
bloaty::usage
const char usage[]
Definition: bloaty.cc:1843
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
re2::ReverseTest::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:282
re2::callback_tests
CallbackTest callback_tests[]
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:338
next
AllocList * next[kMaxLevel]
Definition: abseil-cpp/absl/base/internal/low_level_alloc.cc:100
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1973
ASSERT_FALSE
#define ASSERT_FALSE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1976
testing::MallocCounter
Definition: bloaty/third_party/re2/util/test.h:48
re2::Prog::kUnanchored
@ kUnanchored
Definition: bloaty/third_party/re2/re2/prog.h:176
re2::Prog::TEST_dfa_should_bail_when_slow
static void TEST_dfa_should_bail_when_slow(bool b)
Definition: bloaty/third_party/re2/re2/dfa.cc:2004
testing::MallocCounter::THIS_THREAD_ONLY
static const int THIS_THREAD_ONLY
Definition: bloaty/third_party/re2/util/test.h:51
re2::Prog::kFirstMatch
@ kFirstMatch
Definition: bloaty/third_party/re2/re2/prog.h:193
re2::CallbackTest
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:317
re2::TEST
TEST(TestCharClassBuilder, Adds)
Definition: bloaty/third_party/re2/re2/testing/charclass_test.cc:198
ASSERT_GT
#define ASSERT_GT(val1, val2)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:2076
size
voidpf void uLong size
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
re2::SearchDFA
SearchImpl SearchDFA
Definition: bloaty/third_party/re2/re2/testing/regexp_benchmark.cc:115
regress.m
m
Definition: regress/regress.py:25
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
re2::ReverseTest
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:281
re2::Prog::kLongestMatch
@ kLongestMatch
Definition: bloaty/third_party/re2/re2/prog.h:194
thread
static uv_thread_t thread
Definition: test-async-null-cb.c:29
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Regexp::LikePerl
@ LikePerl
Definition: bloaty/third_party/re2/re2/regexp.h:312


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:16