re2/re2/testing/regexp_generator.cc
Go to the documentation of this file.
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression generator: generates all possible
6 // regular expressions within parameters (see regexp_generator.h for details).
7 
8 // The regexp generator first generates a sequence of commands in a simple
9 // postfix language. Each command in the language is a string,
10 // like "a" or "%s*" or "%s|%s".
11 //
12 // To evaluate a command, enough arguments are popped from the value stack to
13 // plug into the %s slots. Then the result is pushed onto the stack.
14 // For example, the command sequence
15 // a b %s%s c
16 // results in the stack
17 // ab c
18 //
19 // GeneratePostfix generates all possible command sequences.
20 // Then RunPostfix turns each sequence into a regular expression
21 // and passes the regexp to HandleRegexp.
22 
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <memory>
28 #include <stack>
29 #include <string>
30 #include <vector>
31 
32 #include "util/test.h"
33 #include "util/logging.h"
34 #include "util/strutil.h"
35 #include "util/utf.h"
36 #include "re2/testing/regexp_generator.h"
37 
38 namespace re2 {
39 
40 // Returns a vector of the egrep regexp operators.
41 const std::vector<std::string>& RegexpGenerator::EgrepOps() {
42  static const char *ops[] = {
43  "%s%s",
44  "%s|%s",
45  "%s*",
46  "%s+",
47  "%s?",
48  "%s\\C*",
49  };
50  static std::vector<std::string> v(ops, ops + arraysize(ops));
51  return v;
52 }
53 
54 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
55  const std::vector<std::string>& atoms,
56  const std::vector<std::string>& ops)
57  : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
58  // Degenerate case.
59  if (atoms_.empty())
60  maxatoms_ = 0;
61  if (ops_.empty())
62  maxops_ = 0;
63 }
64 
65 // Generates all possible regular expressions (within the parameters),
66 // calling HandleRegexp for each one.
67 void RegexpGenerator::Generate() {
68  std::vector<std::string> postfix;
69  GeneratePostfix(&postfix, 0, 0, 0);
70 }
71 
72 // Generates random regular expressions, calling HandleRegexp for each one.
73 void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
74  rng_.seed(seed);
75 
76  for (int i = 0; i < n; i++) {
77  std::vector<std::string> postfix;
78  GenerateRandomPostfix(&postfix, 0, 0, 0);
79  }
80 }
81 
82 // Counts and returns the number of occurrences of "%s" in s.
83 static int CountArgs(const std::string& s) {
84  const char *p = s.c_str();
85  int n = 0;
86  while ((p = strstr(p, "%s")) != NULL) {
87  p += 2;
88  n++;
89  }
90  return n;
91 }
92 
93 // Generates all possible postfix command sequences.
94 // Each sequence is handed off to RunPostfix to generate a regular expression.
95 // The arguments are:
96 // post: the current postfix sequence
97 // nstk: the number of elements that would be on the stack after executing
98 // the sequence
99 // ops: the number of operators used in the sequence
100 // atoms: the number of atoms used in the sequence
101 // For example, if post were ["a", "b", "%s%s", "c"],
102 // then nstk = 2, ops = 1, atoms = 3.
103 //
104 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
105 //
106 void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
107  int nstk, int ops, int atoms) {
108  if (nstk == 1)
109  RunPostfix(*post);
110 
111  // Early out: if used too many operators or can't
112  // get back down to a single expression on the stack
113  // using binary operators, give up.
114  if (ops + nstk - 1 > maxops_)
115  return;
116 
117  // Add atoms if there is room.
118  if (atoms < maxatoms_) {
119  for (size_t i = 0; i < atoms_.size(); i++) {
120  post->push_back(atoms_[i]);
121  GeneratePostfix(post, nstk + 1, ops, atoms + 1);
122  post->pop_back();
123  }
124  }
125 
126  // Add operators if there are enough arguments.
127  if (ops < maxops_) {
128  for (size_t i = 0; i < ops_.size(); i++) {
129  const std::string& fmt = ops_[i];
130  int nargs = CountArgs(fmt);
131  if (nargs <= nstk) {
132  post->push_back(fmt);
133  GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
134  post->pop_back();
135  }
136  }
137  }
138 }
139 
140 // Generates a random postfix command sequence.
141 // Stops and returns true once a single sequence has been generated.
142 bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
143  int nstk, int ops, int atoms) {
144  std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
145  std::uniform_int_distribution<int> random_bit(0, 1);
146  std::uniform_int_distribution<int> random_ops_index(
147  0, static_cast<int>(ops_.size()) - 1);
148  std::uniform_int_distribution<int> random_atoms_index(
149  0, static_cast<int>(atoms_.size()) - 1);
150 
151  for (;;) {
152  // Stop if we get to a single element, but only sometimes.
153  if (nstk == 1 && random_stop(rng_) == 0) {
154  RunPostfix(*post);
155  return true;
156  }
157 
158  // Early out: if used too many operators or can't
159  // get back down to a single expression on the stack
160  // using binary operators, give up.
161  if (ops + nstk - 1 > maxops_)
162  return false;
163 
164  // Add operators if there are enough arguments.
165  if (ops < maxops_ && random_bit(rng_) == 0) {
166  const std::string& fmt = ops_[random_ops_index(rng_)];
167  int nargs = CountArgs(fmt);
168  if (nargs <= nstk) {
169  post->push_back(fmt);
170  bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
171  ops + 1, atoms);
172  post->pop_back();
173  if (ret)
174  return true;
175  }
176  }
177 
178  // Add atoms if there is room.
179  if (atoms < maxatoms_ && random_bit(rng_) == 0) {
180  post->push_back(atoms_[random_atoms_index(rng_)]);
181  bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
182  post->pop_back();
183  if (ret)
184  return true;
185  }
186  }
187 }
188 
189 // Interprets the postfix command sequence to create a regular expression
190 // passed to HandleRegexp. The results of operators like %s|%s are wrapped
191 // in (?: ) to avoid needing to maintain a precedence table.
192 void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
193  std::stack<std::string> regexps;
194  for (size_t i = 0; i < post.size(); i++) {
195  switch (CountArgs(post[i])) {
196  default:
197  LOG(FATAL) << "Bad operator: " << post[i];
198  case 0:
199  regexps.push(post[i]);
200  break;
201  case 1: {
202  std::string a = regexps.top();
203  regexps.pop();
204  regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
205  break;
206  }
207  case 2: {
208  std::string b = regexps.top();
209  regexps.pop();
210  std::string a = regexps.top();
211  regexps.pop();
212  regexps.push("(?:" +
213  StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
214  ")");
215  break;
216  }
217  }
218  }
219 
220  if (regexps.size() != 1) {
221  // Internal error - should never happen.
222  printf("Bad regexp program:\n");
223  for (size_t i = 0; i < post.size(); i++) {
224  printf(" %s\n", CEscape(post[i]).c_str());
225  }
226  printf("Stack after running program:\n");
227  while (!regexps.empty()) {
228  printf(" %s\n", CEscape(regexps.top()).c_str());
229  regexps.pop();
230  }
231  LOG(FATAL) << "Bad regexp program.";
232  }
233 
234  HandleRegexp(regexps.top());
235  HandleRegexp("^(?:" + regexps.top() + ")$");
236  HandleRegexp("^(?:" + regexps.top() + ")");
237  HandleRegexp("(?:" + regexps.top() + ")$");
238 }
239 
240 // Split s into an vector of strings, one for each UTF-8 character.
241 std::vector<std::string> Explode(const StringPiece& s) {
242  std::vector<std::string> v;
243 
244  for (const char *q = s.data(); q < s.data() + s.size(); ) {
245  const char* p = q;
246  Rune r;
247  q += chartorune(&r, q);
248  v.push_back(std::string(p, q - p));
249  }
250 
251  return v;
252 }
253 
254 // Split string everywhere a substring is found, returning
255 // vector of pieces.
256 std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
257  std::vector<std::string> v;
258 
259  if (sep.empty())
260  return Explode(s);
261 
262  const char *p = s.data();
263  for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
264  if (StringPiece(q, sep.size()) == sep) {
265  v.push_back(std::string(p, q - p));
266  p = q + sep.size();
267  q = p - 1; // -1 for ++ in loop
268  continue;
269  }
270  }
271  if (p < s.data() + s.size())
272  v.push_back(std::string(p, s.data() + s.size() - p));
273  return v;
274 }
275 
276 } // namespace re2
re2::CountArgs
static int CountArgs(const std::string &s)
Definition: re2/re2/testing/regexp_generator.cc:83
post
static void post(QUEUE *q, enum uv__work_kind kind)
Definition: threadpool.c:142
string.h
seed
static const uint8_t seed[20]
Definition: dsa_test.cc:79
printf
_Use_decl_annotations_ int __cdecl printf(const char *_Format,...)
Definition: cs_driver.c:91
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
absl::FormatConversionChar::s
@ s
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
absl::CEscape
std::string CEscape(absl::string_view src)
Definition: abseil-cpp/absl/strings/escaping.cc:854
xds_manager.p
p
Definition: xds_manager.py:60
re2::RegexpGenerator::RegexpGenerator
RegexpGenerator(int maxatoms, int maxops, const std::vector< std::string > &atoms, const std::vector< std::string > &ops)
Definition: bloaty/third_party/re2/re2/testing/regexp_generator.cc:54
gen_stats_data.c_str
def c_str(s, encoding='ascii')
Definition: gen_stats_data.py:38
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
text_format_test_wrapper.sep
sep
Definition: text_format_test_wrapper.py:34
setup.v
v
Definition: third_party/bloaty/third_party/capstone/bindings/python/setup.py:42
grpc_generator::Split
void Split(const std::string &s, char, std::vector< std::string > *append_to)
Definition: generator_helpers.h:169
google::protobuf::StringPrintf
string StringPrintf(const char *format,...)
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/stringprintf.cc:109
asyncio_get_stats.nargs
nargs
Definition: asyncio_get_stats.py:36
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
stdint.h
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
FATAL
#define FATAL(msg)
Definition: task.h:88
testing::internal::fmt
GTEST_API_ const char * fmt
Definition: bloaty/third_party/googletest/googletest/include/gtest/gtest.h:1808
re2::RegexpGenerator::EgrepOps
static const std::vector< std::string > & EgrepOps()
Definition: bloaty/third_party/re2/re2/testing/regexp_generator.cc:41
re2::Explode
std::vector< std::string > Explode(const StringPiece &s)
Definition: bloaty/third_party/re2/re2/testing/regexp_generator.cc:241
rng_
std::mt19937 rng_
Definition: rls.cc:607
re2::Rune
signed int Rune
Definition: bloaty/third_party/re2/util/utf.h:25
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
fix_build_deps.r
r
Definition: fix_build_deps.py:491
re2::chartorune
int chartorune(Rune *rune, const char *str)
Definition: bloaty/third_party/re2/util/rune.cc:51
absl::str_format_internal::LengthMod::q
@ q
int32_t
signed int int32_t
Definition: stdint-msvc2008.h:77
ops
static grpc_op ops[6]
Definition: test/core/fling/client.cc:39
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230


grpc
Author(s):
autogenerated on Fri May 16 2025 03:00:02