bloaty/third_party/re2/re2/mimics_pcre.cc
Go to the documentation of this file.
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29 
30 namespace re2 {
31 
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34 
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would. See comment at top for conditions.
37 
38 class PCREWalker : public Regexp::Walker<bool> {
39  public:
41  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
42  int nchild_args);
43 
44  bool ShortVisit(Regexp* re, bool a) {
45  // Should never be called: we use Walk not WalkExponential.
46  LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
47  return a;
48  }
49 };
50 
51 // Called after visiting each of re's children and accumulating
52 // the return values in child_args. So child_args contains whether
53 // this library mimics PCRE for those subexpressions.
54 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
55  bool* child_args, int nchild_args) {
56  // If children failed, so do we.
57  for (int i = 0; i < nchild_args; i++)
58  if (!child_args[i])
59  return false;
60 
61  // Otherwise look for other reasons to fail.
62  switch (re->op()) {
63  // Look for repeated empty string.
64  case kRegexpStar:
65  case kRegexpPlus:
66  case kRegexpQuest:
67  if (CanBeEmptyString(re->sub()[0]))
68  return false;
69  break;
70  case kRegexpRepeat:
71  if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
72  return false;
73  break;
74 
75  // Look for \v
76  case kRegexpLiteral:
77  if (re->rune() == '\v')
78  return false;
79  break;
80 
81  // Look for $ in single-line mode.
82  case kRegexpEndText:
83  case kRegexpEmptyMatch:
84  if (re->parse_flags() & Regexp::WasDollar)
85  return false;
86  break;
87 
88  // Look for ^ in multi-line mode.
89  case kRegexpBeginLine:
90  // No condition: in single-line mode ^ becomes kRegexpBeginText.
91  return false;
92 
93  default:
94  break;
95  }
96 
97  // Not proven guilty.
98  return true;
99 }
100 
101 // Returns whether this regexp's behavior will mimic PCRE's exactly.
103  PCREWalker w;
104  return w.Walk(this, true);
105 }
106 
107 
108 // Walker class to compute whether a Regexp can match an empty string.
109 // It is okay to overestimate. For example, \b\B cannot match an empty
110 // string, because \b and \B are mutually exclusive, but this isn't
111 // that smart and will say it can. Spurious empty strings
112 // will reduce the number of regexps we sanity check against PCRE,
113 // but they won't break anything.
114 
115 class EmptyStringWalker : public Regexp::Walker<bool> {
116  public:
118  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
119  bool* child_args, int nchild_args);
120 
121  bool ShortVisit(Regexp* re, bool a) {
122  // Should never be called: we use Walk not WalkExponential.
123  LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
124  return a;
125  }
126 
127  private:
128  EmptyStringWalker(const EmptyStringWalker&) = delete;
130 };
131 
132 // Called after visiting re's children. child_args contains the return
133 // value from each of the children's PostVisits (i.e., whether each child
134 // can match an empty string). Returns whether this clause can match an
135 // empty string.
136 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
137  bool* child_args, int nchild_args) {
138  switch (re->op()) {
139  case kRegexpNoMatch: // never empty
140  case kRegexpLiteral:
141  case kRegexpAnyChar:
142  case kRegexpAnyByte:
143  case kRegexpCharClass:
145  return false;
146 
147  case kRegexpEmptyMatch: // always empty
148  case kRegexpBeginLine: // always empty, when they match
149  case kRegexpEndLine:
151  case kRegexpWordBoundary:
152  case kRegexpBeginText:
153  case kRegexpEndText:
154  case kRegexpStar: // can always be empty
155  case kRegexpQuest:
156  case kRegexpHaveMatch:
157  return true;
158 
159  case kRegexpConcat: // can be empty if all children can
160  for (int i = 0; i < nchild_args; i++)
161  if (!child_args[i])
162  return false;
163  return true;
164 
165  case kRegexpAlternate: // can be empty if any child can
166  for (int i = 0; i < nchild_args; i++)
167  if (child_args[i])
168  return true;
169  return false;
170 
171  case kRegexpPlus: // can be empty if the child can
172  case kRegexpCapture:
173  return child_args[0];
174 
175  case kRegexpRepeat: // can be empty if child can or is x{0}
176  return child_args[0] || re->min() == 0;
177  }
178  return false;
179 }
180 
181 // Returns whether re can match an empty string.
182 static bool CanBeEmptyString(Regexp* re) {
184  return w.Walk(re, true);
185 }
186 
187 } // namespace re2
re2::kRegexpEmptyMatch
@ kRegexpEmptyMatch
Definition: bloaty/third_party/re2/re2/regexp.h:107
re2::kRegexpBeginLine
@ kRegexpBeginLine
Definition: bloaty/third_party/re2/re2/regexp.h:142
re2::kRegexpNoWordBoundary
@ kRegexpNoWordBoundary
Definition: bloaty/third_party/re2/re2/regexp.h:149
re2::kRegexpLiteralString
@ kRegexpLiteralString
Definition: bloaty/third_party/re2/re2/regexp.h:113
re2::Regexp
Definition: bloaty/third_party/re2/re2/regexp.h:274
re2::EmptyStringWalker::ShortVisit
bool ShortVisit(Regexp *re, bool a)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:121
re2::kRegexpLiteral
@ kRegexpLiteral
Definition: bloaty/third_party/re2/re2/regexp.h:110
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
re2::kRegexpHaveMatch
@ kRegexpHaveMatch
Definition: bloaty/third_party/re2/re2/regexp.h:161
re2::EmptyStringWalker::PostVisit
bool PostVisit(Regexp *re, bool parent_arg, bool pre_arg, bool *child_args, int nchild_args)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:136
re2::kRegexpWordBoundary
@ kRegexpWordBoundary
Definition: bloaty/third_party/re2/re2/regexp.h:147
re2::kRegexpEndText
@ kRegexpEndText
Definition: bloaty/third_party/re2/re2/regexp.h:154
re2::EmptyStringWalker::EmptyStringWalker
EmptyStringWalker()
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:117
re2::Regexp::Walker
Definition: bloaty/third_party/re2/re2/regexp.h:414
re2::Regexp::rune
Rune rune()
Definition: bloaty/third_party/re2/re2/regexp.h:336
re2::kRegexpConcat
@ kRegexpConcat
Definition: bloaty/third_party/re2/re2/regexp.h:116
re2::PCREWalker::ShortVisit
bool ShortVisit(Regexp *re, bool a)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:44
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Regexp::MimicsPCRE
bool MimicsPCRE()
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:102
re2::kRegexpAnyChar
@ kRegexpAnyChar
Definition: bloaty/third_party/re2/re2/regexp.h:136
re2::kRegexpCharClass
@ kRegexpCharClass
Definition: bloaty/third_party/re2/re2/regexp.h:157
re2::Regexp::sub
Regexp ** sub()
Definition: bloaty/third_party/re2/re2/regexp.h:327
re2::PCREWalker::PCREWalker
PCREWalker()
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:40
re2::PCREWalker
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:38
re2::kRegexpAlternate
@ kRegexpAlternate
Definition: bloaty/third_party/re2/re2/regexp.h:118
re2::Regexp::Walker::Walk
T Walk(Regexp *re, T top_arg)
Definition: bloaty/third_party/re2/re2/walker-inl.h:231
re2::Regexp::min
int min()
Definition: bloaty/third_party/re2/re2/regexp.h:334
re2::kRegexpStar
@ kRegexpStar
Definition: bloaty/third_party/re2/re2/regexp.h:121
re2::Regexp::parse_flags
ParseFlags parse_flags()
Definition: bloaty/third_party/re2/re2/regexp.h:324
re2::Regexp::max
int max()
Definition: bloaty/third_party/re2/re2/regexp.h:335
re2::EmptyStringWalker::operator=
EmptyStringWalker & operator=(const EmptyStringWalker &)=delete
re2::Regexp::op
RegexpOp op()
Definition: bloaty/third_party/re2/re2/regexp.h:321
re2::kRegexpAnyByte
@ kRegexpAnyByte
Definition: bloaty/third_party/re2/re2/regexp.h:139
re2::kRegexpRepeat
@ kRegexpRepeat
Definition: bloaty/third_party/re2/re2/regexp.h:129
re2::kRegexpBeginText
@ kRegexpBeginText
Definition: bloaty/third_party/re2/re2/regexp.h:152
re2::kRegexpEndLine
@ kRegexpEndLine
Definition: bloaty/third_party/re2/re2/regexp.h:144
re2::Regexp::WasDollar
@ WasDollar
Definition: bloaty/third_party/re2/re2/regexp.h:316
re2::kRegexpCapture
@ kRegexpCapture
Definition: bloaty/third_party/re2/re2/regexp.h:133
re2::EmptyStringWalker
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:115
re2::CanBeEmptyString
static bool CanBeEmptyString(Regexp *re)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:182
re2::kRegexpPlus
@ kRegexpPlus
Definition: bloaty/third_party/re2/re2/regexp.h:123
re2::kRegexpNoMatch
@ kRegexpNoMatch
Definition: bloaty/third_party/re2/re2/regexp.h:104
re2::PCREWalker::PostVisit
bool PostVisit(Regexp *re, bool parent_arg, bool pre_arg, bool *child_args, int nchild_args)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:54
re2::kRegexpQuest
@ kRegexpQuest
Definition: bloaty/third_party/re2/re2/regexp.h:125
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:29