re2/re2/testing/dfa_test.cc
Go to the documentation of this file.
1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stdint.h>
6 #include <string>
7 #include <thread>
8 #include <vector>
9 
10 #include "util/test.h"
11 #include "util/flags.h"
12 #include "util/logging.h"
13 #include "util/malloc_counter.h"
14 #include "util/strutil.h"
15 #include "re2/prog.h"
16 #include "re2/re2.h"
17 #include "re2/regexp.h"
18 #include "re2/testing/regexp_generator.h"
19 #include "re2/testing/string_generator.h"
20 
21 static const bool UsingMallocCounter = false;
22 
23 DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
24 DEFINE_FLAG(int, repeat, 2, "Repetition count.");
25 DEFINE_FLAG(int, threads, 4, "number of threads");
26 
27 namespace re2 {
28 
29 static int state_cache_resets = 0;
30 static int search_failures = 0;
31 
32 struct SetHooks {
34  hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) {
36  });
37  hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) {
39  });
40  }
41 } set_hooks;
42 
43 // Check that multithreaded access to DFA class works.
44 
45 // Helper function: builds entire DFA for prog.
46 static void DoBuild(Prog* prog) {
47  ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
48 }
49 
50 TEST(Multithreaded, BuildEntireDFA) {
51  // Create regexp with 2^FLAGS_size states in DFA.
52  std::string s = "a";
53  for (int i = 0; i < GetFlag(FLAGS_size); i++)
54  s += "[ab]";
55  s += "b";
56  Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
57  ASSERT_TRUE(re != NULL);
58 
59  // Check that single-threaded code works.
60  {
61  Prog* prog = re->CompileToProg(0);
62  ASSERT_TRUE(prog != NULL);
63 
65  t.join();
66 
67  delete prog;
68  }
69 
70  // Build the DFA simultaneously in a bunch of threads.
71  for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
72  Prog* prog = re->CompileToProg(0);
73  ASSERT_TRUE(prog != NULL);
74 
75  std::vector<std::thread> threads;
76  for (int j = 0; j < GetFlag(FLAGS_threads); j++)
77  threads.emplace_back(DoBuild, prog);
78  for (int j = 0; j < GetFlag(FLAGS_threads); j++)
79  threads[j].join();
80 
81  // One more compile, to make sure everything is okay.
82  prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
83  delete prog;
84  }
85 
86  re->Decref();
87 }
88 
89 // Check that DFA size requirements are followed.
90 // BuildEntireDFA will, like SearchDFA, stop building out
91 // the DFA once the memory limits are reached.
92 TEST(SingleThreaded, BuildEntireDFA) {
93  // Create regexp with 2^30 states in DFA.
94  Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
95  ASSERT_TRUE(re != NULL);
96 
97  for (int i = 17; i < 24; i++) {
98  int64_t limit = int64_t{1}<<i;
99  int64_t usage;
100  //int64_t progusage, dfamem;
101  {
103  Prog* prog = re->CompileToProg(limit);
104  ASSERT_TRUE(prog != NULL);
105  //progusage = m.HeapGrowth();
106  //dfamem = prog->dfa_mem();
107  prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
108  prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
109  usage = m.HeapGrowth();
110  delete prog;
111  }
112  if (UsingMallocCounter) {
113  //LOG(INFO) << "limit " << limit << ", "
114  // << "prog usage " << progusage << ", "
115  // << "DFA budget " << dfamem << ", "
116  // << "total " << usage;
117  // Tolerate +/- 10%.
118  ASSERT_GT(usage, limit*9/10);
119  ASSERT_LT(usage, limit*11/10);
120  }
121  }
122  re->Decref();
123 }
124 
125 // Test that the DFA gets the right result even if it runs
126 // out of memory during a search. The regular expression
127 // 0[01]{n}$ matches a binary string of 0s and 1s only if
128 // the (n+1)th-to-last character is a 0. Matching this in
129 // a single forward pass (as done by the DFA) requires
130 // keeping one bit for each of the last n+1 characters
131 // (whether each was a 0), or 2^(n+1) possible states.
132 // If we run this regexp to search in a string that contains
133 // every possible n-character binary string as a substring,
134 // then it will have to run through at least 2^n states.
135 // States are big data structures -- certainly more than 1 byte --
136 // so if the DFA can search correctly while staying within a
137 // 2^n byte limit, it must be handling out-of-memory conditions
138 // gracefully.
139 TEST(SingleThreaded, SearchDFA) {
140  // The De Bruijn string is the worst case input for this regexp.
141  // By default, the DFA will notice that it is flushing its cache
142  // too frequently and will bail out early, so that RE2 can use the
143  // NFA implementation instead. (The DFA loses its speed advantage
144  // if it can't get a good cache hit rate.)
145  // Tell the DFA to trudge along instead.
147  state_cache_resets = 0;
148  search_failures = 0;
149 
150  // Choice of n is mostly arbitrary, except that:
151  // * making n too big makes the test run for too long.
152  // * making n too small makes the DFA refuse to run,
153  // because it has so little memory compared to the program size.
154  // Empirically, n = 18 is a good compromise between the two.
155  const int n = 18;
156 
157  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
158  Regexp::LikePerl, NULL);
159  ASSERT_TRUE(re != NULL);
160 
161  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
162  // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
163  std::string no_match = DeBruijnString(n);
164  std::string match = no_match + "0";
165 
166  int64_t usage;
167  int64_t peak_usage;
168  {
170  Prog* prog = re->CompileToProg(1<<n);
171  ASSERT_TRUE(prog != NULL);
172  for (int i = 0; i < 10; i++) {
173  bool matched = false;
174  bool failed = false;
175  matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
176  Prog::kFirstMatch, NULL, &failed, NULL);
177  ASSERT_FALSE(failed);
178  ASSERT_TRUE(matched);
179  matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
180  Prog::kFirstMatch, NULL, &failed, NULL);
181  ASSERT_FALSE(failed);
182  ASSERT_FALSE(matched);
183  }
184  usage = m.HeapGrowth();
185  peak_usage = m.PeakHeapGrowth();
186  delete prog;
187  }
188  if (UsingMallocCounter) {
189  //LOG(INFO) << "usage " << usage << ", "
190  // << "peak usage " << peak_usage;
191  ASSERT_LT(usage, 1<<n);
192  ASSERT_LT(peak_usage, 1<<n);
193  }
194  re->Decref();
195 
196  // Reset to original behaviour.
200 }
201 
202 // Helper function: searches for match, which should match,
203 // and no_match, which should not.
204 static void DoSearch(Prog* prog, const StringPiece& match,
205  const StringPiece& no_match) {
206  for (int i = 0; i < 2; i++) {
207  bool matched = false;
208  bool failed = false;
209  matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
210  Prog::kFirstMatch, NULL, &failed, NULL);
211  ASSERT_FALSE(failed);
212  ASSERT_TRUE(matched);
213  matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
214  Prog::kFirstMatch, NULL, &failed, NULL);
215  ASSERT_FALSE(failed);
216  ASSERT_FALSE(matched);
217  }
218 }
219 
220 TEST(Multithreaded, SearchDFA) {
222  state_cache_resets = 0;
223  search_failures = 0;
224 
225  // Same as single-threaded test above.
226  const int n = 18;
227  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
228  Regexp::LikePerl, NULL);
229  ASSERT_TRUE(re != NULL);
230  std::string no_match = DeBruijnString(n);
231  std::string match = no_match + "0";
232 
233  // Check that single-threaded code works.
234  {
235  Prog* prog = re->CompileToProg(1<<n);
236  ASSERT_TRUE(prog != NULL);
237 
238  std::thread t(DoSearch, prog, match, no_match);
239  t.join();
240 
241  delete prog;
242  }
243 
244  // Run the search simultaneously in a bunch of threads.
245  // Reuse same flags for Multithreaded.BuildDFA above.
246  for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
247  Prog* prog = re->CompileToProg(1<<n);
248  ASSERT_TRUE(prog != NULL);
249 
250  std::vector<std::thread> threads;
251  for (int j = 0; j < GetFlag(FLAGS_threads); j++)
252  threads.emplace_back(DoSearch, prog, match, no_match);
253  for (int j = 0; j < GetFlag(FLAGS_threads); j++)
254  threads[j].join();
255 
256  delete prog;
257  }
258 
259  re->Decref();
260 
261  // Reset to original behaviour.
265 }
266 
267 struct ReverseTest {
268  const char* regexp;
269  const char* text;
270  bool match;
271 };
272 
273 // Test that reverse DFA handles anchored/unanchored correctly.
274 // It's in the DFA interface but not used by RE2.
275 ReverseTest reverse_tests[] = {
276  { "\\A(a|b)", "abc", true },
277  { "(a|b)\\z", "cba", true },
278  { "\\A(a|b)", "cba", false },
279  { "(a|b)\\z", "abc", false },
280 };
281 
282 TEST(DFA, ReverseMatch) {
283  int nfail = 0;
284  for (size_t i = 0; i < arraysize(reverse_tests); i++) {
285  const ReverseTest& t = reverse_tests[i];
286  Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
287  ASSERT_TRUE(re != NULL);
288  Prog* prog = re->CompileToReverseProg(0);
289  ASSERT_TRUE(prog != NULL);
290  bool failed = false;
291  bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
292  Prog::kFirstMatch, NULL, &failed, NULL);
293  if (matched != t.match) {
294  LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
295  nfail++;
296  }
297  delete prog;
298  re->Decref();
299  }
300  EXPECT_EQ(nfail, 0);
301 }
302 
303 struct CallbackTest {
304  const char* regexp;
305  const char* dump;
306 };
307 
308 // Test that DFA::BuildAllStates() builds the expected DFA states
309 // and issues the expected callbacks. These test cases reflect the
310 // very compact encoding of the callbacks, but that also makes them
311 // very difficult to understand, so let's work through "\\Aa\\z".
312 // There are three slots per DFA state because the bytemap has two
313 // equivalence classes and there is a third slot for kByteEndText:
314 // 0: all bytes that are not 'a'
315 // 1: the byte 'a'
316 // 2: kByteEndText
317 // -1 means that there is no transition from that DFA state to any
318 // other DFA state for that slot. The valid transitions are thus:
319 // state 0 --slot 1--> state 1
320 // state 1 --slot 2--> state 2
321 // The double brackets indicate that state 2 is a matching state.
322 // Putting it together, this means that the DFA must consume the
323 // byte 'a' and then hit end of text. Q.E.D.
324 CallbackTest callback_tests[] = {
325  { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
326  { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
327  { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
328  { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
329  { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
330  { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
331  { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
332  { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
333  { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
334  { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
335 };
336 
337 TEST(DFA, Callback) {
338  int nfail = 0;
339  for (size_t i = 0; i < arraysize(callback_tests); i++) {
340  const CallbackTest& t = callback_tests[i];
341  Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
342  ASSERT_TRUE(re != NULL);
343  Prog* prog = re->CompileToProg(0);
344  ASSERT_TRUE(prog != NULL);
345  std::string dump;
346  prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
347  ASSERT_TRUE(next != NULL);
348  if (!dump.empty())
349  dump += " ";
350  dump += match ? "[[" : "[";
351  for (int b = 0; b < prog->bytemap_range() + 1; b++)
352  dump += StringPrintf("%d,", next[b]);
353  dump.pop_back();
354  dump += match ? "]]" : "]";
355  });
356  if (dump != t.dump) {
357  LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
358  LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
359  nfail++;
360  }
361  delete prog;
362  re->Decref();
363  }
364  EXPECT_EQ(nfail, 0);
365 }
366 
367 } // namespace re2
re2::DoSearch
static void DoSearch(Prog *prog, const StringPiece &match, const StringPiece &no_match)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:222
UsingMallocCounter
static const bool UsingMallocCounter
Definition: re2/re2/testing/dfa_test.cc:21
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
re2::hooks::DFASearchFailure
Definition: re2/re2/re2.h:996
absl::str_format_internal::LengthMod::j
@ j
re2::SetHooks
Definition: re2/re2/testing/dfa_test.cc:32
re2::reverse_tests
ReverseTest reverse_tests[]
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:289
match
unsigned char match[65280+2]
Definition: bloaty/third_party/zlib/examples/gun.c:165
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2::search_failures
static int search_failures
Definition: re2/re2/testing/dfa_test.cc:30
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::CallbackTest::dump
const char * dump
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:319
re2::ReverseTest::text
const char * text
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:283
re2::GetFlag
T GetFlag(const T &flag)
Definition: re2/util/flags.h:21
threads
static uv_thread_t * threads
Definition: threadpool.c:38
EXPECT_EQ
#define EXPECT_EQ(a, b)
Definition: iomgr/time_averaged_stats_test.cc:27
re2::CallbackTest::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:318
ASSERT_LT
#define ASSERT_LT(val1, val2)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:2068
int64_t
signed __int64 int64_t
Definition: stdint-msvc2008.h:89
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
malloc_counter.h
re2::StringPrintf
std::string StringPrintf(const char *format,...)
Definition: bloaty/third_party/re2/util/strutil.cc:140
re2::Regexp::Parse
static Regexp * Parse(const StringPiece &s, ParseFlags flags, RegexpStatus *status)
Definition: bloaty/third_party/re2/re2/parse.cc:2200
re2::ReverseTest::match
bool match
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:284
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
google::protobuf::ERROR
static const LogLevel ERROR
Definition: bloaty/third_party/protobuf/src/google/protobuf/testing/googletest.h:70
stdint.h
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::DeBruijnString
static std::string DeBruijnString(int n)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:119
re2::state_cache_resets
static int state_cache_resets
Definition: re2/re2/testing/dfa_test.cc:29
re2::DoBuild
static void DoBuild(Prog *prog)
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:30
bloaty::usage
const char usage[]
Definition: bloaty.cc:1843
re2::SetHooks::SetHooks
SetHooks()
Definition: re2/re2/testing/dfa_test.cc:33
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
re2::ReverseTest::regexp
const char * regexp
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:282
re2::callback_tests
CallbackTest callback_tests[]
Definition: bloaty/third_party/re2/re2/testing/dfa_test.cc:338
absl::str_format_internal::LengthMod::t
@ t
next
AllocList * next[kMaxLevel]
Definition: abseil-cpp/absl/base/internal/low_level_alloc.cc:100
ASSERT_TRUE
#define ASSERT_TRUE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1973
ASSERT_FALSE
#define ASSERT_FALSE(condition)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1976
testing::MallocCounter
Definition: bloaty/third_party/re2/util/test.h:48
re2::Prog::kUnanchored
@ kUnanchored
Definition: bloaty/third_party/re2/re2/prog.h:176
re2::set_hooks
struct re2::SetHooks set_hooks
testing::MallocCounter::THIS_THREAD_ONLY
static const int THIS_THREAD_ONLY
Definition: bloaty/third_party/re2/util/test.h:51
re2::Prog::kFirstMatch
@ kFirstMatch
Definition: bloaty/third_party/re2/re2/prog.h:193
re2::TEST
TEST(TestCharClassBuilder, Adds)
Definition: bloaty/third_party/re2/re2/testing/charclass_test.cc:198
ASSERT_GT
#define ASSERT_GT(val1, val2)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:2076
re2::hooks::DFAStateCacheReset
Definition: re2/re2/re2.h:991
size
voidpf void uLong size
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
re2::Prog::TESTING_ONLY_set_dfa_should_bail_when_slow
static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b)
Definition: re2/re2/dfa.cc:59
re2::SearchDFA
SearchImpl SearchDFA
Definition: bloaty/third_party/re2/re2/testing/regexp_benchmark.cc:115
regress.m
m
Definition: regress/regress.py:25
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
re2::Prog::kLongestMatch
@ kLongestMatch
Definition: bloaty/third_party/re2/re2/prog.h:194
DEFINE_FLAG
DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)")
thread
static uv_thread_t thread
Definition: test-async-null-cb.c:29
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Regexp::LikePerl
@ LikePerl
Definition: bloaty/third_party/re2/re2/regexp.h:312
ASSERT_EQ
#define ASSERT_EQ(val1, val2)
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:2056


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:16