bloaty/third_party/re2/re2/bitstate.cc
Go to the documentation of this file.
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts. Similarly to
9 // testing/backtrack.cc, it allocates a bitmap with (count of
10 // lists) * (length of prog) bits to make sure it never explores the
11 // same (instruction list, character position) multiple times. This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <limits>
24 #include <utility>
25 
26 #include "util/logging.h"
27 #include "util/pod_array.h"
28 #include "re2/prog.h"
29 #include "re2/regexp.h"
30 
31 namespace re2 {
32 
33 struct Job {
34  int id;
35  int rle; // run length encoding
36  const char* p;
37 };
38 
39 class BitState {
40  public:
41  explicit BitState(Prog* prog);
42 
43  // The usual Search prototype.
44  // Can only call Search once per BitState.
45  bool Search(const StringPiece& text, const StringPiece& context,
46  bool anchored, bool longest,
47  StringPiece* submatch, int nsubmatch);
48 
49  private:
50  inline bool ShouldVisit(int id, const char* p);
51  void Push(int id, const char* p);
52  void GrowStack();
53  bool TrySearch(int id, const char* p);
54 
55  // Search parameters
56  Prog* prog_; // program being run
57  StringPiece text_; // text being searched
58  StringPiece context_; // greater context of text being searched
59  bool anchored_; // whether search is anchored at text.begin()
60  bool longest_; // whether search wants leftmost-longest match
61  bool endmatch_; // whether match must end at text.end()
62  StringPiece* submatch_; // submatches to fill in
63  int nsubmatch_; // # of submatches to fill in
64 
65  // Search state
66  static const int VisitedBits = 32;
67  PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited
68  PODArray<const char*> cap_; // capture registers
69  PODArray<Job> job_; // stack of text positions to explore
70  int njob_; // stack size
71 };
72 
74  : prog_(prog),
75  anchored_(false),
76  longest_(false),
77  endmatch_(false),
78  submatch_(NULL),
79  nsubmatch_(0),
80  njob_(0) {
81 }
82 
83 // Given id, which *must* be a list head, we can look up its list ID.
84 // Then the question is: Should the search visit the (list ID, p) pair?
85 // If so, remember that it was visited so that the next time,
86 // we don't repeat the visit.
87 bool BitState::ShouldVisit(int id, const char* p) {
88  int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
89  static_cast<int>(p-text_.data());
90  if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
91  return false;
92  visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
93  return true;
94 }
95 
96 // Grow the stack.
98  PODArray<Job> tmp(2*job_.size());
99  memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
100  job_ = std::move(tmp);
101 }
102 
103 // Push (id, p) onto the stack, growing it if necessary.
104 void BitState::Push(int id, const char* p) {
105  if (njob_ >= job_.size()) {
106  GrowStack();
107  if (njob_ >= job_.size()) {
108  LOG(DFATAL) << "GrowStack() failed: "
109  << "njob_ = " << njob_ << ", "
110  << "job_.size() = " << job_.size();
111  return;
112  }
113  }
114 
115  // If id < 0, it's undoing a Capture,
116  // so we mustn't interfere with that.
117  if (id >= 0 && njob_ > 0) {
118  Job* top = &job_[njob_-1];
119  if (id == top->id &&
120  p == top->p + top->rle + 1 &&
122  ++top->rle;
123  return;
124  }
125  }
126 
127  Job* top = &job_[njob_++];
128  top->id = id;
129  top->rle = 0;
130  top->p = p;
131 }
132 
133 // Try a search from instruction id0 in state p0.
134 // Return whether it succeeded.
135 bool BitState::TrySearch(int id0, const char* p0) {
136  bool matched = false;
137  const char* end = text_.data() + text_.size();
138  njob_ = 0;
139  // Push() no longer checks ShouldVisit(),
140  // so we must perform the check ourselves.
141  if (ShouldVisit(id0, p0))
142  Push(id0, p0);
143  while (njob_ > 0) {
144  // Pop job off stack.
145  --njob_;
146  int id = job_[njob_].id;
147  int& rle = job_[njob_].rle;
148  const char* p = job_[njob_].p;
149 
150  if (id < 0) {
151  // Undo the Capture.
152  cap_[prog_->inst(-id)->cap()] = p;
153  continue;
154  }
155 
156  if (rle > 0) {
157  p += rle;
158  // Revivify job on stack.
159  --rle;
160  ++njob_;
161  }
162 
163  Loop:
164  // Visit id, p.
165  Prog::Inst* ip = prog_->inst(id);
166  switch (ip->opcode()) {
167  default:
168  LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
169  return false;
170 
171  case kInstFail:
172  break;
173 
174  case kInstAltMatch:
175  if (ip->greedy(prog_)) {
176  // out1 is the Match instruction.
177  id = ip->out1();
178  p = end;
179  goto Loop;
180  }
181  if (longest_) {
182  // ip must be non-greedy...
183  // out is the Match instruction.
184  id = ip->out();
185  p = end;
186  goto Loop;
187  }
188  goto Next;
189 
190  case kInstByteRange: {
191  int c = -1;
192  if (p < end)
193  c = *p & 0xFF;
194  if (!ip->Matches(c))
195  goto Next;
196 
197  if (ip->hint() != 0)
198  Push(id+ip->hint(), p); // try the next when we're done
199  id = ip->out();
200  p++;
201  goto CheckAndLoop;
202  }
203 
204  case kInstCapture:
205  if (!ip->last())
206  Push(id+1, p); // try the next when we're done
207 
208  if (0 <= ip->cap() && ip->cap() < cap_.size()) {
209  // Capture p to register, but save old value first.
210  Push(-id, cap_[ip->cap()]); // undo when we're done
211  cap_[ip->cap()] = p;
212  }
213 
214  id = ip->out();
215  goto CheckAndLoop;
216 
217  case kInstEmptyWidth:
218  if (ip->empty() & ~Prog::EmptyFlags(context_, p))
219  goto Next;
220 
221  if (!ip->last())
222  Push(id+1, p); // try the next when we're done
223  id = ip->out();
224  goto CheckAndLoop;
225 
226  case kInstNop:
227  if (!ip->last())
228  Push(id+1, p); // try the next when we're done
229  id = ip->out();
230 
231  CheckAndLoop:
232  // Sanity check: id is the head of its list, which must
233  // be the case if id-1 is the last of *its* list. :)
234  DCHECK(id == 0 || prog_->inst(id-1)->last());
235  if (ShouldVisit(id, p))
236  goto Loop;
237  break;
238 
239  case kInstMatch: {
240  if (endmatch_ && p != end)
241  goto Next;
242 
243  // We found a match. If the caller doesn't care
244  // where the match is, no point going further.
245  if (nsubmatch_ == 0)
246  return true;
247 
248  // Record best match so far.
249  // Only need to check end point, because this entire
250  // call is only considering one start position.
251  matched = true;
252  cap_[1] = p;
253  if (submatch_[0].data() == NULL ||
254  (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
255  for (int i = 0; i < nsubmatch_; i++)
256  submatch_[i] =
257  StringPiece(cap_[2 * i],
258  static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
259  }
260 
261  // If going for first match, we're done.
262  if (!longest_)
263  return true;
264 
265  // If we used the entire text, no longer match is possible.
266  if (p == end)
267  return true;
268 
269  // Otherwise, continue on in hope of a longer match.
270  // Note the absence of the ShouldVisit() check here
271  // due to execution remaining in the same list.
272  Next:
273  if (!ip->last()) {
274  id++;
275  goto Loop;
276  }
277  break;
278  }
279  }
280  }
281  return matched;
282 }
283 
284 // Search text (within context) for prog_.
286  bool anchored, bool longest,
287  StringPiece* submatch, int nsubmatch) {
288  // Search parameters.
289  text_ = text;
290  context_ = context;
291  if (context_.data() == NULL)
292  context_ = text;
293  if (prog_->anchor_start() && context_.begin() != text.begin())
294  return false;
295  if (prog_->anchor_end() && context_.end() != text.end())
296  return false;
297  anchored_ = anchored || prog_->anchor_start();
298  longest_ = longest || prog_->anchor_end();
300  submatch_ = submatch;
301  nsubmatch_ = nsubmatch;
302  for (int i = 0; i < nsubmatch_; i++)
303  submatch_[i] = StringPiece();
304 
305  // Allocate scratch space.
306  int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
307  nvisited = (nvisited + VisitedBits-1) / VisitedBits;
308  visited_ = PODArray<uint32_t>(nvisited);
309  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
310 
311  int ncap = 2*nsubmatch;
312  if (ncap < 2)
313  ncap = 2;
314  cap_ = PODArray<const char*>(ncap);
315  memset(cap_.data(), 0, ncap*sizeof cap_[0]);
316 
317  // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
318  job_ = PODArray<Job>(64);
319 
320  // Anchored search must start at text.begin().
321  if (anchored_) {
322  cap_[0] = text.data();
323  return TrySearch(prog_->start(), text.data());
324  }
325 
326  // Unanchored search, starting from each possible text position.
327  // Notice that we have to try the empty string at the end of
328  // the text, so the loop condition is p <= text.end(), not p < text.end().
329  // This looks like it's quadratic in the size of the text,
330  // but we are not clearing visited_ between calls to TrySearch,
331  // so no work is duplicated and it ends up still being linear.
332  for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
333  // Try to use memchr to find the first byte quickly.
334  int fb = prog_->first_byte();
335  if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) {
336  p = reinterpret_cast<const char*>(
337  memchr(p, fb, text.data() + text.size() - p));
338  if (p == NULL)
339  p = text.data() + text.size();
340  }
341 
342  cap_[0] = p;
343  if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
344  return true;
345  }
346  return false;
347 }
348 
349 // Bit-state search.
351  const StringPiece& context,
352  Anchor anchor,
353  MatchKind kind,
355  int nmatch) {
356  // If full match, we ask for an anchored longest match
357  // and then check that match[0] == text.
358  // So make sure match[0] exists.
359  StringPiece sp0;
360  if (kind == kFullMatch) {
361  anchor = kAnchored;
362  if (nmatch < 1) {
363  match = &sp0;
364  nmatch = 1;
365  }
366  }
367 
368  // Run the search.
369  BitState b(this);
370  bool anchored = anchor == kAnchored;
371  bool longest = kind != kFirstMatch;
372  if (!b.Search(text, context, anchored, longest, match, nmatch))
373  return false;
374  if (kind == kFullMatch && match[0].end() != text.end())
375  return false;
376  return true;
377 }
378 
379 } // namespace re2
re2::Prog::Inst::Matches
bool Matches(int c)
Definition: bloaty/third_party/re2/re2/prog.h:102
re2::Prog::Inst::greedy
bool greedy(Prog *p)
Definition: bloaty/third_party/re2/re2/prog.h:94
re2::kInstFail
@ kInstFail
Definition: bloaty/third_party/re2/re2/prog.h:37
re2::BitState::TrySearch
bool TrySearch(int id, const char *p)
Definition: bloaty/third_party/re2/re2/bitstate.cc:135
re2::BitState::VisitedBits
static const int VisitedBits
Definition: bloaty/third_party/re2/re2/bitstate.cc:66
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
re2::Prog::list_count
int list_count()
Definition: bloaty/third_party/re2/re2/prog.h:207
re2::Prog::Inst::out1
int out1()
Definition: bloaty/third_party/re2/re2/prog.h:85
pod_array.h
memset
return memset(p, 0, total)
false
#define false
Definition: setup_once.h:323
re2::Prog::kFullMatch
@ kFullMatch
Definition: bloaty/third_party/re2/re2/prog.h:195
re2::StringPiece::end
const_iterator end() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:72
re2::BitState::Search
bool Search(const StringPiece &text, const StringPiece &context, bool anchored, bool longest, StringPiece *submatch, int nsubmatch)
Definition: bloaty/third_party/re2/re2/bitstate.cc:285
re2::Prog::Inst
Definition: bloaty/third_party/re2/re2/prog.h:62
re2::Prog::Anchor
Anchor
Definition: bloaty/third_party/re2/re2/prog.h:175
match
unsigned char match[65280+2]
Definition: bloaty/third_party/zlib/examples/gun.c:165
string.h
re2::BitState::cap_
PODArray< const char * > cap_
Definition: bloaty/third_party/re2/re2/bitstate.cc:68
re2::StringPiece::size
size_type size() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:80
re2::PODArray::data
T * data() const
Definition: bloaty/third_party/re2/util/pod_array.h:24
re2::Prog::list_heads
uint16_t * list_heads()
Definition: bloaty/third_party/re2/re2/prog.h:209
re2::BitState::longest_
bool longest_
Definition: bloaty/third_party/re2/re2/bitstate.cc:60
re2::BitState::context_
StringPiece context_
Definition: bloaty/third_party/re2/re2/bitstate.cc:58
re2::BitState::BitState
BitState(Prog *prog)
Definition: bloaty/third_party/re2/re2/bitstate.cc:73
re2::Prog::Inst::empty
EmptyOp empty()
Definition: bloaty/third_party/re2/re2/prog.h:92
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::Prog::Inst::cap
int cap()
Definition: bloaty/third_party/re2/re2/prog.h:86
absl::base_internal::Next
static AllocList * Next(int i, AllocList *prev, LowLevelAlloc::Arena *arena)
Definition: abseil-cpp/absl/base/internal/low_level_alloc.cc:453
re2::kInstEmptyWidth
@ kInstEmptyWidth
Definition: bloaty/third_party/re2/re2/prog.h:34
re2::BitState::text_
StringPiece text_
Definition: bloaty/third_party/re2/re2/bitstate.cc:57
re2::PODArray::size
int size() const
Definition: bloaty/third_party/re2/util/pod_array.h:28
re2::Prog::MatchKind
MatchKind
Definition: bloaty/third_party/re2/re2/prog.h:192
re2::kInstCapture
@ kInstCapture
Definition: bloaty/third_party/re2/re2/prog.h:33
gen_server_registered_method_bad_client_test_body.text
def text
Definition: gen_server_registered_method_bad_client_test_body.py:50
absl::move
constexpr absl::remove_reference_t< T > && move(T &&t) noexcept
Definition: abseil-cpp/absl/utility/utility.h:221
end
char * end
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1008
max
int max
Definition: bloaty/third_party/zlib/examples/enough.c:170
re2::Prog::anchor_end
bool anchor_end()
Definition: bloaty/third_party/re2/re2/prog.h:216
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Prog::SearchBitState
bool SearchBitState(const StringPiece &text, const StringPiece &context, Anchor anchor, MatchKind kind, StringPiece *match, int nmatch)
Definition: bloaty/third_party/re2/re2/bitstate.cc:350
re2::Prog::inst
Inst * inst(int id)
Definition: bloaty/third_party/re2/re2/prog.h:199
re2::BitState::ShouldVisit
bool ShouldVisit(int id, const char *p)
Definition: bloaty/third_party/re2/re2/bitstate.cc:87
re2::Prog::kAnchored
@ kAnchored
Definition: bloaty/third_party/re2/re2/prog.h:177
re2::Prog::Inst::last
int last()
Definition: bloaty/third_party/re2/re2/prog.h:83
data
char data[kBufferLength]
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1006
re2::BitState::Push
void Push(int id, const char *p)
Definition: bloaty/third_party/re2/re2/bitstate.cc:104
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
re2::BitState::endmatch_
bool endmatch_
Definition: bloaty/third_party/re2/re2/bitstate.cc:61
re2::BitState::nsubmatch_
int nsubmatch_
Definition: bloaty/third_party/re2/re2/bitstate.cc:63
re2::BitState::submatch_
StringPiece * submatch_
Definition: bloaty/third_party/re2/re2/bitstate.cc:62
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
stdint.h
re2::Prog::Inst::opcode
InstOp opcode()
Definition: bloaty/third_party/re2/re2/prog.h:82
grpc_core::Loop
promise_detail::Loop< F > Loop(F f)
Definition: loop.h:130
re2::StringPiece::data
const_pointer data() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:85
re2::Job::p
const char * p
Definition: bloaty/third_party/re2/re2/bitstate.cc:36
re2::kInstNop
@ kInstNop
Definition: bloaty/third_party/re2/re2/prog.h:36
re2::BitState::anchored_
bool anchored_
Definition: bloaty/third_party/re2/re2/bitstate.cc:59
re2::kInstMatch
@ kInstMatch
Definition: bloaty/third_party/re2/re2/prog.h:35
re2::kInstAltMatch
@ kInstAltMatch
Definition: bloaty/third_party/re2/re2/prog.h:31
re2::BitState::njob_
int njob_
Definition: bloaty/third_party/re2/re2/bitstate.cc:70
re2::BitState
Definition: bloaty/third_party/re2/re2/bitstate.cc:39
re2::BitState::job_
PODArray< Job > job_
Definition: bloaty/third_party/re2/re2/bitstate.cc:69
re2::Prog::first_byte
int first_byte()
Definition: bloaty/third_party/re2/re2/prog.cc:188
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
re2::Prog::anchor_start
bool anchor_start()
Definition: bloaty/third_party/re2/re2/prog.h:214
re2::Job
Definition: bloaty/third_party/re2/re2/bitstate.cc:33
re2::StringPiece::begin
const_iterator begin() const
Definition: bloaty/third_party/re2/re2/stringpiece.h:71
re2::Prog::Inst::out
int out()
Definition: bloaty/third_party/re2/re2/prog.h:84
re2::BitState::prog_
Prog * prog_
Definition: bloaty/third_party/re2/re2/bitstate.cc:56
re2::Job::rle
int rle
Definition: bloaty/third_party/re2/re2/bitstate.cc:35
re2::Prog::kFirstMatch
@ kFirstMatch
Definition: bloaty/third_party/re2/re2/prog.h:193
re2::BitState::visited_
PODArray< uint32_t > visited_
Definition: bloaty/third_party/re2/re2/bitstate.cc:67
context
grpc::ClientContext context
Definition: istio_echo_server_lib.cc:61
autogen_x86imm.tmp
tmp
Definition: autogen_x86imm.py:12
DCHECK
#define DCHECK(condition)
Definition: bloaty/third_party/re2/util/logging.h:19
re2::Prog::Inst::hint
int hint()
Definition: bloaty/third_party/re2/re2/prog.h:90
size
voidpf void uLong size
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
re2::BitState::GrowStack
void GrowStack()
Definition: bloaty/third_party/re2/re2/bitstate.cc:97
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
re2::PODArray< uint32_t >
top
static upb_pb_encoder_segment * top(upb_pb_encoder *e)
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/upb.c:7624
re2::kInstByteRange
@ kInstByteRange
Definition: bloaty/third_party/re2/re2/prog.h:32
re2::Prog::EmptyFlags
static uint32_t EmptyFlags(const StringPiece &context, const char *p)
Definition: bloaty/third_party/re2/re2/prog.cc:287
re2::Job::id
int id
Definition: bloaty/third_party/re2/re2/bitstate.cc:34
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::Prog::start
int start()
Definition: bloaty/third_party/re2/re2/prog.h:200
id
uint32_t id
Definition: flow_control_fuzzer.cc:70


grpc
Author(s):
autogenerated on Fri May 16 2025 02:57:48