bloaty/third_party/re2/re2/testing/backtrack.cc
Go to the documentation of this file.
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 //
7 // Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
8 // except that it remembers where it has been, trading a lot of
9 // memory for a lot of time. It exists only for testing purposes.
10 //
11 // Let me repeat that.
12 //
13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14 // - It uses a ton of memory.
15 // - It uses a ton of stack.
16 // - It uses CHECK and LOG(FATAL).
17 // - It implements unanchored search by repeated anchored search.
18 //
19 // On the other hand, it is very simple and a good reference
20 // implementation for the more complicated regexp packages.
21 //
22 // In BUILD, this file is linked into the ":testing" library,
23 // not the main library, in order to make it harder to pick up
24 // accidentally.
25 
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <string.h>
29 
30 #include "util/util.h"
31 #include "util/logging.h"
32 #include "re2/prog.h"
33 #include "re2/regexp.h"
34 
35 namespace re2 {
36 
37 // Backtracker holds the state for a backtracking search.
38 //
39 // Excluding the search parameters, the main search state
40 // is just the "capture registers", which record, for the
41 // current execution, the string position at which each
42 // parenthesis was passed. cap_[0] and cap_[1] are the
43 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
44 //
45 // To avoid infinite loops during backtracking on expressions
46 // like (a*)*, the visited_[] bitmap marks the (state, string-position)
47 // pairs that have already been explored and are thus not worth
48 // re-exploring if we get there via another path. Modern backtracking
49 // libraries engineer their program representation differently, to make
50 // such infinite loops possible to avoid without keeping a giant visited_
51 // bitmap, but visited_ works fine for a reference implementation
52 // and it has the nice benefit of making the search run in linear time.
53 class Backtracker {
54  public:
55  explicit Backtracker(Prog* prog);
56  ~Backtracker();
57 
58  bool Search(const StringPiece& text, const StringPiece& context,
59  bool anchored, bool longest,
60  StringPiece* submatch, int nsubmatch);
61 
62  private:
63  // Explores from instruction id at string position p looking for a match.
64  // Returns true if found (so that caller can stop trying other possibilities).
65  bool Visit(int id, const char* p);
66 
67  // Tries instruction id at string position p.
68  // Returns true if a match is found.
69  bool Try(int id, const char* p);
70 
71  // Search parameters
72  Prog* prog_; // program being run
73  StringPiece text_; // text being searched
74  StringPiece context_; // greater context of text being searched
75  bool anchored_; // whether search is anchored at text.begin()
76  bool longest_; // whether search wants leftmost-longest match
77  bool endmatch_; // whether search must end at text.end()
78  StringPiece *submatch_; // submatches to fill in
79  int nsubmatch_; // # of submatches to fill in
80 
81  // Search state
82  const char* cap_[64]; // capture registers
83  uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
84  size_t nvisited_; // # of words in bitmap
85 };
86 
88  : prog_(prog),
89  anchored_(false),
90  longest_(false),
91  endmatch_(false),
92  submatch_(NULL),
93  nsubmatch_(0),
94  visited_(NULL),
95  nvisited_(0) {
96 }
97 
99  delete[] visited_;
100 }
101 
102 // Runs a backtracking search.
104  bool anchored, bool longest,
105  StringPiece* submatch, int nsubmatch) {
106  text_ = text;
107  context_ = context;
108  if (context_.data() == NULL)
109  context_ = text;
110  if (prog_->anchor_start() && text.begin() > context_.begin())
111  return false;
112  if (prog_->anchor_end() && text.end() < context_.end())
113  return false;
114  anchored_ = anchored | prog_->anchor_start();
115  longest_ = longest | prog_->anchor_end();
117  submatch_ = submatch;
118  nsubmatch_ = nsubmatch;
119  CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
120  memset(cap_, 0, sizeof cap_);
121 
122  // We use submatch_[0] for our own bookkeeping,
123  // so it had better exist.
124  StringPiece sp0;
125  if (nsubmatch < 1) {
126  submatch_ = &sp0;
127  nsubmatch_ = 1;
128  }
129  submatch_[0] = StringPiece();
130 
131  // Allocate new visited_ bitmap -- size is proportional
132  // to text, so have to reallocate on each call to Search.
133  delete[] visited_;
134  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
135  visited_ = new uint32_t[nvisited_];
136  memset(visited_, 0, nvisited_*sizeof visited_[0]);
137 
138  // Anchored search must start at text.begin().
139  if (anchored_) {
140  cap_[0] = text.data();
141  return Visit(prog_->start(), text.data());
142  }
143 
144  // Unanchored search, starting from each possible text position.
145  // Notice that we have to try the empty string at the end of
146  // the text, so the loop condition is p <= text.end(), not p < text.end().
147  for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
148  cap_[0] = p;
149  if (Visit(prog_->start(), p)) // Match must be leftmost; done.
150  return true;
151  }
152  return false;
153 }
154 
155 // Explores from instruction id at string position p looking for a match.
156 // Return true if found (so that caller can stop trying other possibilities).
157 bool Backtracker::Visit(int id, const char* p) {
158  // Check bitmap. If we've already explored from here,
159  // either it didn't match or it did but we're hoping for a better match.
160  // Either way, don't go down that road again.
161  CHECK(p <= text_.data() + text_.size());
162  size_t n = id*(text_.size()+1) + (p - text_.data());
163  CHECK_LT(n/32, nvisited_);
164  if (visited_[n/32] & (1 << (n&31)))
165  return false;
166  visited_[n/32] |= 1 << (n&31);
167 
168  Prog::Inst* ip = prog_->inst(id);
169  if (Try(id, p)) {
170  if (longest_ && !ip->last())
171  Visit(id+1, p);
172  return true;
173  }
174  if (!ip->last())
175  return Visit(id+1, p);
176  return false;
177 }
178 
179 // Tries instruction id at string position p.
180 // Returns true if a match is found.
181 bool Backtracker::Try(int id, const char* p) {
182  // Pick out byte at current position. If at end of string,
183  // have to explore in hope of finishing a match. Use impossible byte -1.
184  int c = -1;
185  if (p < text_.data() + text_.size())
186  c = *p & 0xFF;
187 
188  Prog::Inst* ip = prog_->inst(id);
189  switch (ip->opcode()) {
190  default:
191  LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
192  return false; // not reached
193 
194  case kInstAltMatch:
195  // Ignored.
196  return false;
197 
198  case kInstByteRange:
199  if (ip->Matches(c))
200  return Visit(ip->out(), p+1);
201  return false;
202 
203  case kInstCapture:
204  if (0 <= ip->cap() &&
205  ip->cap() < static_cast<int>(arraysize(cap_))) {
206  // Capture p to register, but save old value.
207  const char* q = cap_[ip->cap()];
208  cap_[ip->cap()] = p;
209  bool ret = Visit(ip->out(), p);
210  // Restore old value as we backtrack.
211  cap_[ip->cap()] = q;
212  return ret;
213  }
214  return Visit(ip->out(), p);
215 
216  case kInstEmptyWidth:
217  if (ip->empty() & ~Prog::EmptyFlags(context_, p))
218  return false;
219  return Visit(ip->out(), p);
220 
221  case kInstNop:
222  return Visit(ip->out(), p);
223 
224  case kInstMatch:
225  // We found a match. If it's the best so far, record the
226  // parameters in the caller's submatch_ array.
227  if (endmatch_ && p != context_.data() + context_.size())
228  return false;
229  cap_[1] = p;
230  if (submatch_[0].data() == NULL ||
231  (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
232  // First match so far - or better match.
233  for (int i = 0; i < nsubmatch_; i++)
235  cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
236  }
237  return true;
238 
239  case kInstFail:
240  return false;
241  }
242 }
243 
244 // Runs a backtracking search.
246  const StringPiece& context,
247  Anchor anchor,
248  MatchKind kind,
250  int nmatch) {
251  // If full match, we ask for an anchored longest match
252  // and then check that match[0] == text.
253  // So make sure match[0] exists.
254  StringPiece sp0;
255  if (kind == kFullMatch) {
256  anchor = kAnchored;
257  if (nmatch < 1) {
258  match = &sp0;
259  nmatch = 1;
260  }
261  }
262 
263  // Run the search.
264  Backtracker b(this);
265  bool anchored = anchor == kAnchored;
266  bool longest = kind != kFirstMatch;
267  if (!b.Search(text, context, anchored, longest, match, nmatch))
268  return false;
269  if (kind == kFullMatch && match[0].end() != text.end())
270  return false;
271  return true;
272 }
273 
274 } // namespace re2
re2::Prog::Inst::Matches
bool Matches(int c)
Definition: bloaty/third_party/re2/re2/prog.h:102
re2::kInstFail
@ kInstFail
Definition: bloaty/third_party/re2/re2/prog.h:37
re2::Backtracker::text_
StringPiece text_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:73
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
memset
return memset(p, 0, total)
false
#define false
Definition: setup_once.h:323
re2::Prog::kFullMatch
@ kFullMatch
Definition: bloaty/third_party/re2/re2/prog.h:195
re2::StringPiece::end
const_iterator end() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:72
re2::Prog::Inst
Definition: bloaty/third_party/re2/re2/prog.h:62
re2::Prog::Anchor
Anchor
Definition: bloaty/third_party/re2/re2/prog.h:175
match
unsigned char match[65280+2]
Definition: bloaty/third_party/zlib/examples/gun.c:165
string.h
re2::StringPiece::size
size_type size() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:80
re2::Backtracker::nvisited_
size_t nvisited_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:84
re2::Prog::Inst::empty
EmptyOp empty()
Definition: bloaty/third_party/re2/re2/prog.h:92
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::Prog::Inst::cap
int cap()
Definition: bloaty/third_party/re2/re2/prog.h:86
re2::kInstEmptyWidth
@ kInstEmptyWidth
Definition: bloaty/third_party/re2/re2/prog.h:34
re2::Backtracker::anchored_
bool anchored_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:75
re2::Prog::MatchKind
MatchKind
Definition: bloaty/third_party/re2/re2/prog.h:192
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
re2::kInstCapture
@ kInstCapture
Definition: bloaty/third_party/re2/re2/prog.h:33
re2::Backtracker::prog_
Prog * prog_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:72
gen_server_registered_method_bad_client_test_body.text
def text
Definition: gen_server_registered_method_bad_client_test_body.py:50
xds_interop_client.int
int
Definition: xds_interop_client.py:113
end
char * end
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1008
re2::Backtracker::longest_
bool longest_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:76
re2::Prog::anchor_end
bool anchor_end()
Definition: bloaty/third_party/re2/re2/prog.h:216
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Backtracker::Backtracker
Backtracker(Prog *prog)
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:87
re2::Prog::inst
Inst * inst(int id)
Definition: bloaty/third_party/re2/re2/prog.h:199
CHECK
#define CHECK(x)
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/upb.c:8085
re2::Prog::kAnchored
@ kAnchored
Definition: bloaty/third_party/re2/re2/prog.h:177
re2::Prog::Inst::last
int last()
Definition: bloaty/third_party/re2/re2/prog.h:83
re2::Backtracker::endmatch_
bool endmatch_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:77
re2::Prog::size
int size()
Definition: bloaty/third_party/re2/re2/prog.h:204
data
char data[kBufferLength]
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1006
re2::Backtracker::context_
StringPiece context_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:74
re2::Backtracker::submatch_
StringPiece * submatch_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:78
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
re2::Backtracker::cap_
const char * cap_[64]
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:82
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
stdint.h
re2::Prog::Inst::opcode
InstOp opcode()
Definition: bloaty/third_party/re2/re2/prog.h:82
arraysize
#define arraysize(array)
Definition: benchmark/src/arraysize.h:28
re2::Backtracker::Try
bool Try(int id, const char *p)
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:181
re2::StringPiece::data
const_pointer data() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:85
FATAL
#define FATAL(msg)
Definition: task.h:88
CHECK_LT
#define CHECK_LT(a, b)
Definition: bloaty/third_party/protobuf/third_party/benchmark/src/check.h:70
re2::kInstNop
@ kInstNop
Definition: bloaty/third_party/re2/re2/prog.h:36
re2::kInstMatch
@ kInstMatch
Definition: bloaty/third_party/re2/re2/prog.h:35
re2::kInstAltMatch
@ kInstAltMatch
Definition: bloaty/third_party/re2/re2/prog.h:31
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
re2::Prog::anchor_start
bool anchor_start()
Definition: bloaty/third_party/re2/re2/prog.h:214
re2::Backtracker::Visit
bool Visit(int id, const char *p)
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:157
re2::StringPiece::begin
const_iterator begin() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:71
re2::Prog::Inst::out
int out()
Definition: bloaty/third_party/re2/re2/prog.h:84
re2::Backtracker
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:53
re2::Backtracker::visited_
uint32_t * visited_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:83
re2::Backtracker::nsubmatch_
int nsubmatch_
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:79
re2::Backtracker::Search
bool Search(const StringPiece &text, const StringPiece &context, bool anchored, bool longest, StringPiece *submatch, int nsubmatch)
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:103
re2::Prog::kFirstMatch
@ kFirstMatch
Definition: bloaty/third_party/re2/re2/prog.h:193
re2::Prog::UnsafeSearchBacktrack
bool UnsafeSearchBacktrack(const StringPiece &text, const StringPiece &context, Anchor anchor, MatchKind kind, StringPiece *match, int nmatch)
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:245
context
grpc::ClientContext context
Definition: istio_echo_server_lib.cc:61
re2::Backtracker::~Backtracker
~Backtracker()
Definition: bloaty/third_party/re2/re2/testing/backtrack.cc:98
size
voidpf void uLong size
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
re2::kInstByteRange
@ kInstByteRange
Definition: bloaty/third_party/re2/re2/prog.h:32
re2::Prog::EmptyFlags
static uint32_t EmptyFlags(const StringPiece &context, const char *p)
Definition: bloaty/third_party/re2/re2/prog.cc:287
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Prog::start
int start()
Definition: bloaty/third_party/re2/re2/prog.h:200


grpc
Author(s):
autogenerated on Fri May 16 2025 02:57:45