bloaty/third_party/re2/re2/prog.cc
Go to the documentation of this file.
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7 
8 #include "re2/prog.h"
9 
10 #include <stdint.h>
11 #include <string.h>
12 #include <algorithm>
13 #include <memory>
14 #include <utility>
15 
16 #include "util/util.h"
17 #include "util/logging.h"
18 #include "util/strutil.h"
19 #include "re2/bitmap256.h"
20 #include "re2/stringpiece.h"
21 
22 namespace re2 {
23 
24 // Constructors per Inst opcode
25 
29  out1_ = out1;
30 }
31 
32 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
33  DCHECK_EQ(out_opcode_, 0);
34  set_out_opcode(out, kInstByteRange);
35  lo_ = lo & 0xFF;
36  hi_ = hi & 0xFF;
37  hint_foldcase_ = foldcase&1;
38 }
39 
41  DCHECK_EQ(out_opcode_, 0);
42  set_out_opcode(out, kInstCapture);
43  cap_ = cap;
44 }
45 
47  DCHECK_EQ(out_opcode_, 0);
48  set_out_opcode(out, kInstEmptyWidth);
49  empty_ = empty;
50 }
51 
53  DCHECK_EQ(out_opcode_, 0);
54  set_opcode(kInstMatch);
55  match_id_ = id;
56 }
57 
59  DCHECK_EQ(out_opcode_, 0);
60  set_opcode(kInstNop);
61 }
62 
64  DCHECK_EQ(out_opcode_, 0);
65  set_opcode(kInstFail);
66 }
67 
69  switch (opcode()) {
70  default:
71  return StringPrintf("opcode %d", static_cast<int>(opcode()));
72 
73  case kInstAlt:
74  return StringPrintf("alt -> %d | %d", out(), out1_);
75 
76  case kInstAltMatch:
77  return StringPrintf("altmatch -> %d | %d", out(), out1_);
78 
79  case kInstByteRange:
80  return StringPrintf("byte%s [%02x-%02x] %d -> %d",
81  foldcase() ? "/i" : "",
82  lo_, hi_, hint(), out());
83 
84  case kInstCapture:
85  return StringPrintf("capture %d -> %d", cap_, out());
86 
87  case kInstEmptyWidth:
88  return StringPrintf("emptywidth %#x -> %d",
89  static_cast<int>(empty_), out());
90 
91  case kInstMatch:
92  return StringPrintf("match! %d", match_id());
93 
94  case kInstNop:
95  return StringPrintf("nop -> %d", out());
96 
97  case kInstFail:
98  return StringPrintf("fail");
99  }
100 }
101 
103  : anchor_start_(false),
105  reversed_(false),
108  start_(0),
110  size_(0),
111  bytemap_range_(0),
112  first_byte_(-1),
113  flags_(0),
114  list_count_(0),
115  dfa_mem_(0),
116  dfa_first_(NULL),
117  dfa_longest_(NULL) {
118 }
119 
123 }
124 
125 typedef SparseSet Workq;
126 
127 static inline void AddToQueue(Workq* q, int id) {
128  if (id != 0)
129  q->insert(id);
130 }
131 
133  std::string s;
134  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
135  int id = *i;
136  Prog::Inst* ip = prog->inst(id);
137  s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
138  AddToQueue(q, ip->out());
139  if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
140  AddToQueue(q, ip->out1());
141  }
142  return s;
143 }
144 
146  std::string s;
147  for (int id = start; id < prog->size(); id++) {
148  Prog::Inst* ip = prog->inst(id);
149  if (ip->last())
150  s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
151  else
152  s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
153  }
154  return s;
155 }
156 
158  if (did_flatten_)
159  return FlattenedProgToString(this, start_);
160 
161  Workq q(size_);
162  AddToQueue(&q, start_);
163  return ProgToString(this, &q);
164 }
165 
167  if (did_flatten_)
169 
170  Workq q(size_);
172  return ProgToString(this, &q);
173 }
174 
177  for (int c = 0; c < 256; c++) {
178  int b = bytemap_[c];
179  int lo = c;
180  while (c < 256-1 && bytemap_[c+1] == b)
181  c++;
182  int hi = c;
183  map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
184  }
185  return map;
186 }
187 
190  prog->first_byte_ = prog->ComputeFirstByte();
191  }, this);
192  return first_byte_;
193 }
194 
195 static bool IsMatch(Prog*, Prog::Inst*);
196 
197 // Peep-hole optimizer.
199  Workq q(size_);
200 
201  // Eliminate nops. Most are taken out during compilation
202  // but a few are hard to avoid.
203  q.clear();
204  AddToQueue(&q, start_);
205  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
206  int id = *i;
207 
208  Inst* ip = inst(id);
209  int j = ip->out();
210  Inst* jp;
211  while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
212  j = jp->out();
213  }
214  ip->set_out(j);
215  AddToQueue(&q, ip->out());
216 
217  if (ip->opcode() == kInstAlt) {
218  j = ip->out1();
219  while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
220  j = jp->out();
221  }
222  ip->out1_ = j;
223  AddToQueue(&q, ip->out1());
224  }
225  }
226 
227  // Insert kInstAltMatch instructions
228  // Look for
229  // ip: Alt -> j | k
230  // j: ByteRange [00-FF] -> ip
231  // k: Match
232  // or the reverse (the above is the greedy one).
233  // Rewrite Alt to AltMatch.
234  q.clear();
235  AddToQueue(&q, start_);
236  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
237  int id = *i;
238  Inst* ip = inst(id);
239  AddToQueue(&q, ip->out());
240  if (ip->opcode() == kInstAlt)
241  AddToQueue(&q, ip->out1());
242 
243  if (ip->opcode() == kInstAlt) {
244  Inst* j = inst(ip->out());
245  Inst* k = inst(ip->out1());
246  if (j->opcode() == kInstByteRange && j->out() == id &&
247  j->lo() == 0x00 && j->hi() == 0xFF &&
248  IsMatch(this, k)) {
250  continue;
251  }
252  if (IsMatch(this, j) &&
253  k->opcode() == kInstByteRange && k->out() == id &&
254  k->lo() == 0x00 && k->hi() == 0xFF) {
256  }
257  }
258  }
259 }
260 
261 // Is ip a guaranteed match at end of text, perhaps after some capturing?
262 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
263  for (;;) {
264  switch (ip->opcode()) {
265  default:
266  LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
267  return false;
268 
269  case kInstAlt:
270  case kInstAltMatch:
271  case kInstByteRange:
272  case kInstFail:
273  case kInstEmptyWidth:
274  return false;
275 
276  case kInstCapture:
277  case kInstNop:
278  ip = prog->inst(ip->out());
279  break;
280 
281  case kInstMatch:
282  return true;
283  }
284  }
285 }
286 
287 uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
288  int flags = 0;
289 
290  // ^ and \A
291  if (p == text.data())
293  else if (p[-1] == '\n')
295 
296  // $ and \z
297  if (p == text.data() + text.size())
299  else if (p < text.data() + text.size() && p[0] == '\n')
300  flags |= kEmptyEndLine;
301 
302  // \b and \B
303  if (p == text.data() && p == text.data() + text.size()) {
304  // no word boundary here
305  } else if (p == text.data()) {
306  if (IsWordChar(p[0]))
308  } else if (p == text.data() + text.size()) {
309  if (IsWordChar(p[-1]))
311  } else {
312  if (IsWordChar(p[-1]) != IsWordChar(p[0]))
314  }
315  if (!(flags & kEmptyWordBoundary))
317 
318  return flags;
319 }
320 
321 // ByteMapBuilder implements a coloring algorithm.
322 //
323 // The first phase is a series of "mark and merge" batches: we mark one or more
324 // [lo-hi] ranges, then merge them into our internal state. Batching is not for
325 // performance; rather, it means that the ranges are treated indistinguishably.
326 //
327 // Internally, the ranges are represented using a bitmap that stores the splits
328 // and a vector that stores the colors; both of them are indexed by the ranges'
329 // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
330 // hi (if not already split), then recolor each range in between. The color map
331 // (i.e. from the old color to the new color) is maintained for the lifetime of
332 // the batch and so underpins this somewhat obscure approach to set operations.
333 //
334 // The second phase builds the bytemap from our internal state: we recolor each
335 // range, then store the new color (which is now the byte class) in each of the
336 // corresponding array elements. Finally, we output the number of byte classes.
338  public:
340  // Initial state: the [0-255] range has color 256.
341  // This will avoid problems during the second phase,
342  // in which we assign byte classes numbered from 0.
343  splits_.Set(255);
344  colors_[255] = 256;
345  nextcolor_ = 257;
346  }
347 
348  void Mark(int lo, int hi);
349  void Merge();
350  void Build(uint8_t* bytemap, int* bytemap_range);
351 
352  private:
353  int Recolor(int oldcolor);
354 
356  int colors_[256];
358  std::vector<std::pair<int, int>> colormap_;
359  std::vector<std::pair<int, int>> ranges_;
360 
361  ByteMapBuilder(const ByteMapBuilder&) = delete;
362  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
363 };
364 
365 void ByteMapBuilder::Mark(int lo, int hi) {
366  DCHECK_GE(lo, 0);
367  DCHECK_GE(hi, 0);
368  DCHECK_LE(lo, 255);
369  DCHECK_LE(hi, 255);
370  DCHECK_LE(lo, hi);
371 
372  // Ignore any [0-255] ranges. They cause us to recolor every range, which
373  // has no effect on the eventual result and is therefore a waste of time.
374  if (lo == 0 && hi == 255)
375  return;
376 
377  ranges_.emplace_back(lo, hi);
378 }
379 
381  for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
382  it != ranges_.end();
383  ++it) {
384  int lo = it->first-1;
385  int hi = it->second;
386 
387  if (0 <= lo && !splits_.Test(lo)) {
388  splits_.Set(lo);
389  int next = splits_.FindNextSetBit(lo+1);
390  colors_[lo] = colors_[next];
391  }
392  if (!splits_.Test(hi)) {
393  splits_.Set(hi);
394  int next = splits_.FindNextSetBit(hi+1);
395  colors_[hi] = colors_[next];
396  }
397 
398  int c = lo+1;
399  while (c < 256) {
400  int next = splits_.FindNextSetBit(c);
402  if (next == hi)
403  break;
404  c = next+1;
405  }
406  }
407  colormap_.clear();
408  ranges_.clear();
409 }
410 
411 void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
412  // Assign byte classes numbered from 0.
413  nextcolor_ = 0;
414 
415  int c = 0;
416  while (c < 256) {
417  int next = splits_.FindNextSetBit(c);
418  uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
419  while (c <= next) {
420  bytemap[c] = b;
421  c++;
422  }
423  }
424 
425  *bytemap_range = nextcolor_;
426 }
427 
428 int ByteMapBuilder::Recolor(int oldcolor) {
429  // Yes, this is a linear search. There can be at most 256
430  // colors and there will typically be far fewer than that.
431  // Also, we need to consider keys *and* values in order to
432  // avoid recoloring a given range more than once per batch.
433  std::vector<std::pair<int, int>>::const_iterator it =
434  std::find_if(colormap_.begin(), colormap_.end(),
435  [=](const std::pair<int, int>& kv) -> bool {
436  return kv.first == oldcolor || kv.second == oldcolor;
437  });
438  if (it != colormap_.end())
439  return it->second;
440  int newcolor = nextcolor_;
441  nextcolor_++;
442  colormap_.emplace_back(oldcolor, newcolor);
443  return newcolor;
444 }
445 
447  // Fill in bytemap with byte classes for the program.
448  // Ranges of bytes that are treated indistinguishably
449  // will be mapped to a single byte class.
451 
452  // Don't repeat the work for ^ and $.
453  bool marked_line_boundaries = false;
454  // Don't repeat the work for \b and \B.
455  bool marked_word_boundaries = false;
456 
457  for (int id = 0; id < size(); id++) {
458  Inst* ip = inst(id);
459  if (ip->opcode() == kInstByteRange) {
460  int lo = ip->lo();
461  int hi = ip->hi();
462  builder.Mark(lo, hi);
463  if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
464  int foldlo = lo;
465  int foldhi = hi;
466  if (foldlo < 'a')
467  foldlo = 'a';
468  if (foldhi > 'z')
469  foldhi = 'z';
470  if (foldlo <= foldhi) {
471  foldlo += 'A' - 'a';
472  foldhi += 'A' - 'a';
473  builder.Mark(foldlo, foldhi);
474  }
475  }
476  // If this Inst is not the last Inst in its list AND the next Inst is
477  // also a ByteRange AND the Insts have the same out, defer the merge.
478  if (!ip->last() &&
479  inst(id+1)->opcode() == kInstByteRange &&
480  ip->out() == inst(id+1)->out())
481  continue;
482  builder.Merge();
483  } else if (ip->opcode() == kInstEmptyWidth) {
484  if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
485  !marked_line_boundaries) {
486  builder.Mark('\n', '\n');
487  builder.Merge();
488  marked_line_boundaries = true;
489  }
491  !marked_word_boundaries) {
492  // We require two batches here: the first for ranges that are word
493  // characters, the second for ranges that are not word characters.
494  for (bool isword : {true, false}) {
495  int j;
496  for (int i = 0; i < 256; i = j) {
497  for (j = i + 1; j < 256 &&
498  Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499  Prog::IsWordChar(static_cast<uint8_t>(j));
500  j++)
501  ;
502  if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503  builder.Mark(i, j - 1);
504  }
505  builder.Merge();
506  }
507  marked_word_boundaries = true;
508  }
509  }
510  }
511 
513 
514  if (0) { // For debugging, use trivial bytemap.
515  LOG(ERROR) << "Using trivial bytemap.";
516  for (int i = 0; i < 256; i++)
517  bytemap_[i] = static_cast<uint8_t>(i);
518  bytemap_range_ = 256;
519  }
520 }
521 
522 // Prog::Flatten() implements a graph rewriting algorithm.
523 //
524 // The overall process is similar to epsilon removal, but retains some epsilon
525 // transitions: those from Capture and EmptyWidth instructions; and those from
526 // nullable subexpressions. (The latter avoids quadratic blowup in transitions
527 // in the worst case.) It might be best thought of as Alt instruction elision.
528 //
529 // In conceptual terms, it divides the Prog into "trees" of instructions, then
530 // traverses the "trees" in order to produce "lists" of instructions. A "tree"
531 // is one or more instructions that grow from one "root" instruction to one or
532 // more "leaf" instructions; if a "tree" has exactly one instruction, then the
533 // "root" is also the "leaf". In most cases, a "root" is the successor of some
534 // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535 // and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536 // EmptyWidth or Match instruction. However, this is insufficient for handling
537 // nested nullable subexpressions correctly, so in some cases, a "root" is the
538 // dominator of the instructions reachable from some "successor root" (i.e. it
539 // has an unreachable predecessor) and is considered a "dominator root". Since
540 // only Alt instructions can be "dominator roots" (other instructions would be
541 // "leaves"), only Alt instructions are required to be marked as predecessors.
542 //
543 // Dividing the Prog into "trees" comprises two passes: marking the "successor
544 // roots" and the predecessors; and marking the "dominator roots". Sorting the
545 // "successor roots" by their bytecode offsets enables iteration in order from
546 // greatest to least during the second pass; by working backwards in this case
547 // and flooding the graph no further than "leaves" and already marked "roots",
548 // it becomes possible to mark "dominator roots" without doing excessive work.
549 //
550 // Traversing the "trees" is just iterating over the "roots" in order of their
551 // marking and flooding the graph no further than "leaves" and "roots". When a
552 // "leaf" is reached, the instruction is copied with its successor remapped to
553 // its "root" number. When a "root" is reached, a Nop instruction is generated
554 // with its successor remapped similarly. As each "list" is produced, its last
555 // instruction is marked as such. After all of the "lists" have been produced,
556 // a pass over their instructions remaps their successors to bytecode offsets.
558  if (did_flatten_)
559  return;
560  did_flatten_ = true;
561 
562  // Scratch structures. It's important that these are reused by functions
563  // that we call in loops because they would thrash the heap otherwise.
564  SparseSet reachable(size());
565  std::vector<int> stk;
566  stk.reserve(size());
567 
568  // First pass: Marks "successor roots" and predecessors.
569  // Builds the mapping from inst-ids to root-ids.
570  SparseArray<int> rootmap(size());
571  SparseArray<int> predmap(size());
572  std::vector<std::vector<int>> predvec;
573  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574 
575  // Second pass: Marks "dominator roots".
576  SparseArray<int> sorted(rootmap);
577  std::sort(sorted.begin(), sorted.end(), sorted.less);
578  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
579  i != sorted.begin();
580  --i) {
581  if (i->index() != start_unanchored() && i->index() != start())
582  MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583  }
584 
585  // Third pass: Emits "lists". Remaps outs to root-ids.
586  // Builds the mapping from root-ids to flat-ids.
587  std::vector<int> flatmap(rootmap.size());
588  std::vector<Inst> flat;
589  flat.reserve(size());
590  for (SparseArray<int>::const_iterator i = rootmap.begin();
591  i != rootmap.end();
592  ++i) {
593  flatmap[i->value()] = static_cast<int>(flat.size());
594  EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595  flat.back().set_last();
596  // We have the bounds of the "list", so this is the
597  // most convenient point at which to compute hints.
598  ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
599  }
600 
601  list_count_ = static_cast<int>(flatmap.size());
602  for (int i = 0; i < kNumInst; i++)
603  inst_count_[i] = 0;
604 
605  // Fourth pass: Remaps outs to flat-ids.
606  // Counts instructions by opcode.
607  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
608  Inst* ip = &flat[id];
609  if (ip->opcode() != kInstAltMatch) // handled in EmitList()
610  ip->set_out(flatmap[ip->out()]);
611  inst_count_[ip->opcode()]++;
612  }
613 
614  int total = 0;
615  for (int i = 0; i < kNumInst; i++)
616  total += inst_count_[i];
617  DCHECK_EQ(total, static_cast<int>(flat.size()));
618 
619  // Remap start_unanchored and start.
620  if (start_unanchored() == 0) {
621  DCHECK_EQ(start(), 0);
622  } else if (start_unanchored() == start()) {
623  set_start_unanchored(flatmap[1]);
624  set_start(flatmap[1]);
625  } else {
626  set_start_unanchored(flatmap[1]);
627  set_start(flatmap[2]);
628  }
629 
630  // Finally, replace the old instructions with the new instructions.
631  size_ = static_cast<int>(flat.size());
633  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
634 
635  // Populate the list heads for BitState.
636  // 512 instructions limits the memory footprint to 1KiB.
637  if (size_ <= 512) {
639  // 0xFF makes it more obvious if we try to look up a non-head.
640  memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
641  for (int i = 0; i < list_count_; ++i)
642  list_heads_[flatmap[i]] = i;
643  }
644 }
645 
647  SparseArray<int>* predmap,
648  std::vector<std::vector<int>>* predvec,
649  SparseSet* reachable, std::vector<int>* stk) {
650  // Mark the kInstFail instruction.
651  rootmap->set_new(0, rootmap->size());
652 
653  // Mark the start_unanchored and start instructions.
654  if (!rootmap->has_index(start_unanchored()))
655  rootmap->set_new(start_unanchored(), rootmap->size());
656  if (!rootmap->has_index(start()))
657  rootmap->set_new(start(), rootmap->size());
658 
659  reachable->clear();
660  stk->clear();
661  stk->push_back(start_unanchored());
662  while (!stk->empty()) {
663  int id = stk->back();
664  stk->pop_back();
665  Loop:
666  if (reachable->contains(id))
667  continue;
668  reachable->insert_new(id);
669 
670  Inst* ip = inst(id);
671  switch (ip->opcode()) {
672  default:
673  LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
674  break;
675 
676  case kInstAltMatch:
677  case kInstAlt:
678  // Mark this instruction as a predecessor of each out.
679  for (int out : {ip->out(), ip->out1()}) {
680  if (!predmap->has_index(out)) {
681  predmap->set_new(out, static_cast<int>(predvec->size()));
682  predvec->emplace_back();
683  }
684  (*predvec)[predmap->get_existing(out)].emplace_back(id);
685  }
686  stk->push_back(ip->out1());
687  id = ip->out();
688  goto Loop;
689 
690  case kInstByteRange:
691  case kInstCapture:
692  case kInstEmptyWidth:
693  // Mark the out of this instruction as a "root".
694  if (!rootmap->has_index(ip->out()))
695  rootmap->set_new(ip->out(), rootmap->size());
696  id = ip->out();
697  goto Loop;
698 
699  case kInstNop:
700  id = ip->out();
701  goto Loop;
702 
703  case kInstMatch:
704  case kInstFail:
705  break;
706  }
707  }
708 }
709 
711  SparseArray<int>* predmap,
712  std::vector<std::vector<int>>* predvec,
713  SparseSet* reachable, std::vector<int>* stk) {
714  reachable->clear();
715  stk->clear();
716  stk->push_back(root);
717  while (!stk->empty()) {
718  int id = stk->back();
719  stk->pop_back();
720  Loop:
721  if (reachable->contains(id))
722  continue;
723  reachable->insert_new(id);
724 
725  if (id != root && rootmap->has_index(id)) {
726  // We reached another "tree" via epsilon transition.
727  continue;
728  }
729 
730  Inst* ip = inst(id);
731  switch (ip->opcode()) {
732  default:
733  LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
734  break;
735 
736  case kInstAltMatch:
737  case kInstAlt:
738  stk->push_back(ip->out1());
739  id = ip->out();
740  goto Loop;
741 
742  case kInstByteRange:
743  case kInstCapture:
744  case kInstEmptyWidth:
745  break;
746 
747  case kInstNop:
748  id = ip->out();
749  goto Loop;
750 
751  case kInstMatch:
752  case kInstFail:
753  break;
754  }
755  }
756 
757  for (SparseSet::const_iterator i = reachable->begin();
758  i != reachable->end();
759  ++i) {
760  int id = *i;
761  if (predmap->has_index(id)) {
762  for (int pred : (*predvec)[predmap->get_existing(id)]) {
763  if (!reachable->contains(pred)) {
764  // id has a predecessor that cannot be reached from root!
765  // Therefore, id must be a "root" too - mark it as such.
766  if (!rootmap->has_index(id))
767  rootmap->set_new(id, rootmap->size());
768  }
769  }
770  }
771  }
772 }
773 
775  std::vector<Inst>* flat,
776  SparseSet* reachable, std::vector<int>* stk) {
777  reachable->clear();
778  stk->clear();
779  stk->push_back(root);
780  while (!stk->empty()) {
781  int id = stk->back();
782  stk->pop_back();
783  Loop:
784  if (reachable->contains(id))
785  continue;
786  reachable->insert_new(id);
787 
788  if (id != root && rootmap->has_index(id)) {
789  // We reached another "tree" via epsilon transition. Emit a kInstNop
790  // instruction so that the Prog does not become quadratically larger.
791  flat->emplace_back();
792  flat->back().set_opcode(kInstNop);
793  flat->back().set_out(rootmap->get_existing(id));
794  continue;
795  }
796 
797  Inst* ip = inst(id);
798  switch (ip->opcode()) {
799  default:
800  LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
801  break;
802 
803  case kInstAltMatch:
804  flat->emplace_back();
805  flat->back().set_opcode(kInstAltMatch);
806  flat->back().set_out(static_cast<int>(flat->size()));
807  flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
809 
810  case kInstAlt:
811  stk->push_back(ip->out1());
812  id = ip->out();
813  goto Loop;
814 
815  case kInstByteRange:
816  case kInstCapture:
817  case kInstEmptyWidth:
818  flat->emplace_back();
819  memmove(&flat->back(), ip, sizeof *ip);
820  flat->back().set_out(rootmap->get_existing(ip->out()));
821  break;
822 
823  case kInstNop:
824  id = ip->out();
825  goto Loop;
826 
827  case kInstMatch:
828  case kInstFail:
829  flat->emplace_back();
830  memmove(&flat->back(), ip, sizeof *ip);
831  break;
832  }
833  }
834 }
835 
836 // For each ByteRange instruction in [begin, end), computes a hint to execution
837 // engines: the delta to the next instruction (in flat) worth exploring iff the
838 // current instruction matched.
839 //
840 // Implements a coloring algorithm related to ByteMapBuilder, but in this case,
841 // colors are instructions and recoloring ranges precisely identifies conflicts
842 // between instructions. Iterating backwards over [begin, end) is guaranteed to
843 // identify the nearest conflict (if any) with only linear complexity.
844 void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
845  Bitmap256 splits;
846  int colors[256];
847 
848  bool dirty = false;
849  for (int id = end; id >= begin; --id) {
850  if (id == end ||
851  (*flat)[id].opcode() != kInstByteRange) {
852  if (dirty) {
853  dirty = false;
854  splits.Clear();
855  }
856  splits.Set(255);
857  colors[255] = id;
858  // At this point, the [0-255] range is colored with id.
859  // Thus, hints cannot point beyond id; and if id == end,
860  // hints that would have pointed to id will be 0 instead.
861  continue;
862  }
863  dirty = true;
864 
865  // We recolor the [lo-hi] range with id. Note that first ratchets backwards
866  // from end to the nearest conflict (if any) during recoloring.
867  int first = end;
868  auto Recolor = [&](int lo, int hi) {
869  // Like ByteMapBuilder, we split at lo-1 and at hi.
870  --lo;
871 
872  if (0 <= lo && !splits.Test(lo)) {
873  splits.Set(lo);
874  int next = splits.FindNextSetBit(lo+1);
875  colors[lo] = colors[next];
876  }
877  if (!splits.Test(hi)) {
878  splits.Set(hi);
879  int next = splits.FindNextSetBit(hi+1);
880  colors[hi] = colors[next];
881  }
882 
883  int c = lo+1;
884  while (c < 256) {
885  int next = splits.FindNextSetBit(c);
886  // Ratchet backwards...
887  first = std::min(first, colors[next]);
888  // Recolor with id - because it's the new nearest conflict!
889  colors[next] = id;
890  if (next == hi)
891  break;
892  c = next+1;
893  }
894  };
895 
896  Inst* ip = &(*flat)[id];
897  int lo = ip->lo();
898  int hi = ip->hi();
899  Recolor(lo, hi);
900  if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
901  int foldlo = lo;
902  int foldhi = hi;
903  if (foldlo < 'a')
904  foldlo = 'a';
905  if (foldhi > 'z')
906  foldhi = 'z';
907  if (foldlo <= foldhi) {
908  foldlo += 'A' - 'a';
909  foldhi += 'A' - 'a';
910  Recolor(foldlo, foldhi);
911  }
912  }
913 
914  if (first != end) {
915  uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
916  ip->hint_foldcase_ |= hint<<1;
917  }
918  }
919 }
920 
921 } // namespace re2
re2::Prog::Inst::out1_
uint32_t out1_
Definition: bloaty/third_party/re2/re2/prog.h:135
re2::kInstFail
@ kInstFail
Definition: bloaty/third_party/re2/re2/prog.h:37
re2::Prog::dfa_first_
DFA * dfa_first_
Definition: bloaty/third_party/re2/re2/prog.h:417
re2::ByteMapBuilder::ByteMapBuilder
ByteMapBuilder()
Definition: bloaty/third_party/re2/re2/prog.cc:339
gen_build_yaml.out
dictionary out
Definition: src/benchmark/gen_build_yaml.py:24
re2::Prog::did_onepass_
bool did_onepass_
Definition: bloaty/third_party/re2/re2/prog.h:399
re2::Prog::Inst::hi
int hi()
Definition: bloaty/third_party/re2/re2/prog.h:88
regen-readme.it
it
Definition: regen-readme.py:15
re2::IsMatch
static bool IsMatch(Prog *, Prog::Inst *)
Definition: bloaty/third_party/re2/re2/prog.cc:262
prog
char * prog
Definition: bloaty/third_party/zlib/contrib/untgz/untgz.c:125
re2::Prog::DumpUnanchored
std::string DumpUnanchored()
Definition: bloaty/third_party/re2/re2/prog.cc:166
re2::ByteMapBuilder::operator=
ByteMapBuilder & operator=(const ByteMapBuilder &)=delete
re2::Prog::Inst::InitCapture
void InitCapture(int cap, uint32_t out)
Definition: bloaty/third_party/re2/re2/prog.cc:40
re2::Prog::Inst::out1
int out1()
Definition: bloaty/third_party/re2/re2/prog.h:85
re2::Prog::flags
int flags()
Definition: bloaty/third_party/re2/re2/prog.h:212
memset
return memset(p, 0, total)
begin
char * begin
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1007
false
#define false
Definition: setup_once.h:323
uint16_t
unsigned short uint16_t
Definition: stdint-msvc2008.h:79
DCHECK_LE
#define DCHECK_LE(val1, val2)
Definition: bloaty/third_party/re2/util/logging.h:22
re2::Prog::Inst
Definition: bloaty/third_party/re2/re2/prog.h:62
re2::Prog::Prog
Prog()
Definition: bloaty/third_party/re2/re2/prog.cc:102
total
size_t total
Definition: cord_analysis.cc:59
re2::Prog::reversed_
bool reversed_
Definition: bloaty/third_party/re2/re2/prog.h:397
re2::Bitmap256
Definition: bloaty/third_party/re2/re2/bitmap256.h:19
string.h
re2::Prog::did_flatten_
bool did_flatten_
Definition: bloaty/third_party/re2/re2/prog.h:398
testing::internal::string
::std::string string
Definition: bloaty/third_party/protobuf/third_party/googletest/googletest/include/gtest/internal/gtest-port.h:881
re2::PODArray::data
T * data() const
Definition: bloaty/third_party/re2/util/pod_array.h:24
re2::Prog::size_
int size_
Definition: bloaty/third_party/re2/re2/prog.h:403
re2::Prog::Inst::empty
EmptyOp empty()
Definition: bloaty/third_party/re2/re2/prog.h:92
re2::Prog::ComputeByteMap
void ComputeByteMap()
Definition: bloaty/third_party/re2/re2/prog.cc:446
re2::Prog::list_count_
int list_count_
Definition: bloaty/third_party/re2/re2/prog.h:408
re2::Prog::Inst::InitMatch
void InitMatch(int id)
re2::Prog::MarkSuccessors
void MarkSuccessors(SparseArray< int > *rootmap, SparseArray< int > *predmap, std::vector< std::vector< int >> *predvec, SparseSet *reachable, std::vector< int > *stk)
Definition: bloaty/third_party/re2/re2/prog.cc:646
re2::Prog::ComputeHints
void ComputeHints(std::vector< Inst > *flat, int begin, int end)
Definition: bloaty/third_party/re2/re2/prog.cc:844
re2::Prog::Inst::set_out
void set_out(int out)
Definition: bloaty/third_party/re2/re2/prog.h:125
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
re2::SparseArray::end
iterator end()
Definition: bloaty/third_party/re2/util/sparse_array.h:142
uint8_t
unsigned char uint8_t
Definition: stdint-msvc2008.h:78
setup.k
k
Definition: third_party/bloaty/third_party/capstone/bindings/python/setup.py:42
map
zval * map
Definition: php/ext/google/protobuf/encode_decode.c:480
re2::Prog::Inst::set_out_opcode
void set_out_opcode(int out, InstOp opcode)
Definition: bloaty/third_party/re2/re2/prog.h:129
re2::kInstEmptyWidth
@ kInstEmptyWidth
Definition: bloaty/third_party/re2/re2/prog.h:34
re2::Prog::IsWordChar
static bool IsWordChar(uint8_t c)
Definition: bloaty/third_party/re2/re2/prog.h:240
re2::Prog::first_byte_
int first_byte_
Definition: bloaty/third_party/re2/re2/prog.h:405
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
re2::SparseSetT::clear
void clear()
Definition: bloaty/third_party/re2/util/sparse_set.h:117
re2::SparseSetT::const_iterator
const typedef int * const_iterator
Definition: bloaty/third_party/re2/util/sparse_set.h:76
re2::kEmptyWordBoundary
@ kEmptyWordBoundary
Definition: bloaty/third_party/re2/re2/prog.h:47
absl::call_once
void call_once(absl::once_flag &flag, Callable &&fn, Args &&... args)
Definition: abseil-cpp/absl/base/call_once.h:206
re2::Prog::Inst::InitAlt
void InitAlt(uint32_t out, uint32_t out1)
Definition: bloaty/third_party/re2/re2/prog.cc:26
re2::Prog::DumpByteMap
std::string DumpByteMap()
Definition: bloaty/third_party/re2/re2/prog.cc:175
re2::SparseSetT::contains
bool contains(int i) const
Definition: bloaty/third_party/re2/util/sparse_set.h:220
re2::SparseSetT
Definition: bloaty/third_party/re2/util/sparse_set.h:69
re2::ByteMapBuilder::Recolor
int Recolor(int oldcolor)
Definition: bloaty/third_party/re2/re2/prog.cc:428
re2::kInstCapture
@ kInstCapture
Definition: bloaty/third_party/re2/re2/prog.h:33
re2::Prog::Inst::InitEmptyWidth
void InitEmptyWidth(EmptyOp empty, uint32_t out)
Definition: bloaty/third_party/re2/re2/prog.cc:46
start
static uint64_t start
Definition: benchmark-pound.c:74
profile_analyzer.builder
builder
Definition: profile_analyzer.py:159
gen_server_registered_method_bad_client_test_body.text
def text
Definition: gen_server_registered_method_bad_client_test_body.py:50
re2::Bitmap256::Set
void Set(int c)
Definition: bloaty/third_party/re2/re2/bitmap256.h:39
end
char * end
Definition: abseil-cpp/absl/strings/internal/str_format/float_conversion.cc:1008
re2::FlattenedProgToString
static std::string FlattenedProgToString(Prog *prog, int start)
Definition: bloaty/third_party/re2/re2/prog.cc:145
re2::ByteMapBuilder::Mark
void Mark(int lo, int hi)
Definition: bloaty/third_party/re2/re2/prog.cc:365
re2::ByteMapBuilder::colors_
int colors_[256]
Definition: bloaty/third_party/re2/re2/prog.cc:356
root
RefCountedPtr< grpc_tls_certificate_provider > root
Definition: xds_server_config_fetcher.cc:223
FALLTHROUGH_INTENDED
#define FALLTHROUGH_INTENDED
Definition: bloaty/third_party/re2/util/util.h:26
re2::kEmptyNonWordBoundary
@ kEmptyNonWordBoundary
Definition: bloaty/third_party/re2/re2/prog.h:48
re2::Bitmap256::FindNextSetBit
int FindNextSetBit(int c) const
Definition: bloaty/third_party/re2/re2/bitmap256.h:86
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::EmptyOp
EmptyOp
Definition: bloaty/third_party/re2/re2/prog.h:42
re2::SparseArray
Definition: bloaty/third_party/re2/util/sparse_array.h:110
re2::Prog::inst_
PODArray< Inst > inst_
Definition: bloaty/third_party/re2/re2/prog.h:413
re2::Prog::EmitList
void EmitList(int root, SparseArray< int > *rootmap, std::vector< Inst > *flat, SparseSet *reachable, std::vector< int > *stk)
Definition: bloaty/third_party/re2/re2/prog.cc:774
re2::StringPrintf
std::string StringPrintf(const char *format,...)
Definition: bloaty/third_party/re2/util/strutil.cc:140
re2::Prog::Flatten
void Flatten()
Definition: bloaty/third_party/re2/re2/prog.cc:557
re2::Prog::inst
Inst * inst(int id)
Definition: bloaty/third_party/re2/re2/prog.h:199
re2::Prog::inst_count_
int inst_count_[kNumInst]
Definition: bloaty/third_party/re2/re2/prog.h:409
re2::Prog::Inst::InitNop
void InitNop(uint32_t out)
Definition: bloaty/third_party/re2/re2/prog.cc:58
re2::SparseArray::get_existing
Value & get_existing(int i)
Definition: bloaty/third_party/re2/util/sparse_array.h:200
re2::kEmptyBeginText
@ kEmptyBeginText
Definition: bloaty/third_party/re2/re2/prog.h:45
re2::kNumInst
@ kNumInst
Definition: bloaty/third_party/re2/re2/prog.h:38
re2::Prog::DeleteDFA
void DeleteDFA(DFA *dfa)
Definition: bloaty/third_party/re2/re2/dfa.cc:1847
re2::Prog::~Prog
~Prog()
Definition: bloaty/third_party/re2/re2/prog.cc:120
re2::Prog::Inst::last
int last()
Definition: bloaty/third_party/re2/re2/prog.h:83
re2::Prog::set_start_unanchored
void set_start_unanchored(int start)
Definition: bloaty/third_party/re2/re2/prog.h:203
re2::Prog::size
int size()
Definition: bloaty/third_party/re2/re2/prog.h:204
re2::ByteMapBuilder::colormap_
std::vector< std::pair< int, int > > colormap_
Definition: bloaty/third_party/re2/re2/prog.cc:358
re2::Prog::anchor_end_
bool anchor_end_
Definition: bloaty/third_party/re2/re2/prog.h:396
re2::ProgToString
static std::string ProgToString(Prog *prog, Workq *q)
Definition: bloaty/third_party/re2/re2/prog.cc:132
min
#define min(a, b)
Definition: qsort.h:83
re2::Prog::set_start
void set_start(int start)
Definition: bloaty/third_party/re2/re2/prog.h:202
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
re2::SparseArray::set_new
iterator set_new(int i, const Value &v)
Definition: bloaty/third_party/re2/util/sparse_array.h:188
re2::kInstAlt
@ kInstAlt
Definition: bloaty/third_party/re2/re2/prog.h:30
google::protobuf::ERROR
static const LogLevel ERROR
Definition: bloaty/third_party/protobuf/src/google/protobuf/testing/googletest.h:70
stdint.h
re2::ByteMapBuilder::Merge
void Merge()
Definition: bloaty/third_party/re2/re2/prog.cc:380
re2::Prog::Inst::opcode
InstOp opcode()
Definition: bloaty/third_party/re2/re2/prog.h:82
google_benchmark.example.empty
def empty(state)
Definition: example.py:31
re2::Bitmap256::Test
bool Test(int c) const
Definition: bloaty/third_party/re2/re2/bitmap256.h:31
re2::Bitmap256::Clear
void Clear()
Definition: bloaty/third_party/re2/re2/bitmap256.h:26
re2::SparseArray::size
int size() const
Definition: bloaty/third_party/re2/util/sparse_array.h:129
re2::kEmptyEndLine
@ kEmptyEndLine
Definition: bloaty/third_party/re2/re2/prog.h:44
grpc_core::Loop
promise_detail::Loop< F > Loop(F f)
Definition: loop.h:130
re2::SparseSetT::begin
iterator begin()
Definition: bloaty/third_party/re2/util/sparse_set.h:89
re2::Prog::MarkDominator
void MarkDominator(int root, SparseArray< int > *rootmap, SparseArray< int > *predmap, std::vector< std::vector< int >> *predvec, SparseSet *reachable, std::vector< int > *stk)
Definition: bloaty/third_party/re2/re2/prog.cc:710
re2::Prog::Inst::foldcase
int foldcase()
Definition: bloaty/third_party/re2/re2/prog.h:89
re2::ByteMapBuilder::Build
void Build(uint8_t *bytemap, int *bytemap_range)
Definition: bloaty/third_party/re2/re2/prog.cc:411
re2::kInstNop
@ kInstNop
Definition: bloaty/third_party/re2/re2/prog.h:36
re2::Prog::Inst::set_opcode
void set_opcode(InstOp opcode)
Definition: bloaty/third_party/re2/re2/prog.h:117
re2::kInstMatch
@ kInstMatch
Definition: bloaty/third_party/re2/re2/prog.h:35
re2::Prog::Inst::out_opcode_
uint32_t out_opcode_
Definition: bloaty/third_party/re2/re2/prog.h:133
re2::kInstAltMatch
@ kInstAltMatch
Definition: bloaty/third_party/re2/re2/prog.h:31
absl::flags_internal
Definition: abseil-cpp/absl/flags/commandlineflag.h:40
re2::Prog::start_unanchored_
int start_unanchored_
Definition: bloaty/third_party/re2/re2/prog.h:402
re2::AddToQueue
static void AddToQueue(Workq *q, int id)
Definition: bloaty/third_party/re2/re2/prog.cc:127
re2::Prog::first_byte
int first_byte()
Definition: bloaty/third_party/re2/re2/prog.cc:188
re2::Prog
Definition: bloaty/third_party/re2/re2/prog.h:56
re2::SparseArray::begin
iterator begin()
Definition: bloaty/third_party/re2/util/sparse_array.h:139
re2::Prog::anchor_start_
bool anchor_start_
Definition: bloaty/third_party/re2/re2/prog.h:395
DCHECK_EQ
#define DCHECK_EQ(val1, val2)
Definition: bloaty/third_party/re2/util/logging.h:20
re2::Prog::bytemap_
uint8_t bytemap_[256]
Definition: bloaty/third_party/re2/re2/prog.h:420
re2::ByteMapBuilder::splits_
Bitmap256 splits_
Definition: bloaty/third_party/re2/re2/prog.cc:355
re2::SparseSetT::iterator
int * iterator
Definition: bloaty/third_party/re2/util/sparse_set.h:75
re2::Prog::dfa_longest_
DFA * dfa_longest_
Definition: bloaty/third_party/re2/re2/prog.h:418
re2::ByteMapBuilder
Definition: bloaty/third_party/re2/re2/prog.cc:337
next
AllocList * next[kMaxLevel]
Definition: abseil-cpp/absl/base/internal/low_level_alloc.cc:100
first
StrT first
Definition: cxa_demangle.cpp:4884
re2::Prog::Inst::out
int out()
Definition: bloaty/third_party/re2/re2/prog.h:84
re2::SparseArray::less
static bool less(const IndexValue &a, const IndexValue &b)
Definition: bloaty/third_party/re2/util/sparse_array.h:385
re2::Workq
SparseSet Workq
Definition: bloaty/third_party/re2/re2/prog.cc:125
re2::Prog::Inst::hint_foldcase_
uint16_t hint_foldcase_
Definition: bloaty/third_party/re2/re2/prog.h:151
DCHECK_GE
#define DCHECK_GE(val1, val2)
Definition: bloaty/third_party/re2/util/logging.h:24
re2::ByteMapBuilder::nextcolor_
int nextcolor_
Definition: bloaty/third_party/re2/re2/prog.cc:357
re2::SparseArray::has_index
bool has_index(int i) const
Definition: bloaty/third_party/re2/util/sparse_array.h:349
re2::Prog::Inst::InitFail
void InitFail()
Definition: bloaty/third_party/re2/re2/prog.cc:63
re2::Prog::start_unanchored
int start_unanchored()
Definition: bloaty/third_party/re2/re2/prog.h:201
re2::Prog::bytemap_range_
int bytemap_range_
Definition: bloaty/third_party/re2/re2/prog.h:404
re2::kEmptyEndText
@ kEmptyEndText
Definition: bloaty/third_party/re2/re2/prog.h:46
re2::kEmptyBeginLine
@ kEmptyBeginLine
Definition: bloaty/third_party/re2/re2/prog.h:43
re2::Prog::first_byte_once_
std::once_flag first_byte_once_
Definition: bloaty/third_party/re2/re2/prog.h:422
re2::Prog::list_heads_
PODArray< uint16_t > list_heads_
Definition: bloaty/third_party/re2/re2/prog.h:410
re2::Prog::Inst::lo
int lo()
Definition: bloaty/third_party/re2/re2/prog.h:87
re2::Prog::Inst::Dump
std::string Dump()
Definition: bloaty/third_party/re2/re2/prog.cc:68
re2::Prog::dfa_mem_
int64_t dfa_mem_
Definition: bloaty/third_party/re2/re2/prog.h:416
re2::Prog::flags_
int flags_
Definition: bloaty/third_party/re2/re2/prog.h:406
int32_t
signed int int32_t
Definition: stdint-msvc2008.h:77
re2::ByteMapBuilder::ranges_
std::vector< std::pair< int, int > > ranges_
Definition: bloaty/third_party/re2/re2/prog.cc:359
re2::StringPiece
Definition: bloaty/third_party/re2/re2/stringpiece.h:39
re2::Prog::Inst::InitByteRange
void InitByteRange(int lo, int hi, int foldcase, uint32_t out)
Definition: bloaty/third_party/re2/re2/prog.cc:32
opcode
opcode
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/upb.h:6290
re2::SparseSetT::end
iterator end()
Definition: bloaty/third_party/re2/util/sparse_set.h:92
re2::PODArray
Definition: bloaty/third_party/re2/util/pod_array.h:14
re2::Prog::start_
int start_
Definition: bloaty/third_party/re2/re2/prog.h:401
re2::kInstByteRange
@ kInstByteRange
Definition: bloaty/third_party/re2/re2/prog.h:32
re2::Prog::EmptyFlags
static uint32_t EmptyFlags(const StringPiece &context, const char *p)
Definition: bloaty/third_party/re2/re2/prog.cc:287
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
re2::SparseSetT::insert_new
iterator insert_new(int i)
Definition: bloaty/third_party/re2/util/sparse_set.h:138
re2::Prog::start
int start()
Definition: bloaty/third_party/re2/re2/prog.h:200
id
uint32_t id
Definition: flow_control_fuzzer.cc:70
re2::Prog::Dump
std::string Dump()
Definition: bloaty/third_party/re2/re2/prog.cc:157
re2::Prog::Optimize
void Optimize()
Definition: bloaty/third_party/re2/re2/prog.cc:198


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:46