re2/re2/mimics_pcre.cc
Go to the documentation of this file.
1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29 
30 namespace re2 {
31 
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34 
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would. See comment at top for conditions.
37 
38 class PCREWalker : public Regexp::Walker<bool> {
39  public:
41 
42  virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
43  bool* child_args, int nchild_args);
44 
45  virtual bool ShortVisit(Regexp* re, bool a) {
46  // Should never be called: we use Walk(), not WalkExponential().
47 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48  LOG(DFATAL) << "PCREWalker::ShortVisit called";
49 #endif
50  return a;
51  }
52 
53  private:
54  PCREWalker(const PCREWalker&) = delete;
55  PCREWalker& operator=(const PCREWalker&) = delete;
56 };
57 
58 // Called after visiting each of re's children and accumulating
59 // the return values in child_args. So child_args contains whether
60 // this library mimics PCRE for those subexpressions.
61 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
62  bool* child_args, int nchild_args) {
63  // If children failed, so do we.
64  for (int i = 0; i < nchild_args; i++)
65  if (!child_args[i])
66  return false;
67 
68  // Otherwise look for other reasons to fail.
69  switch (re->op()) {
70  // Look for repeated empty string.
71  case kRegexpStar:
72  case kRegexpPlus:
73  case kRegexpQuest:
74  if (CanBeEmptyString(re->sub()[0]))
75  return false;
76  break;
77  case kRegexpRepeat:
78  if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
79  return false;
80  break;
81 
82  // Look for \v
83  case kRegexpLiteral:
84  if (re->rune() == '\v')
85  return false;
86  break;
87 
88  // Look for $ in single-line mode.
89  case kRegexpEndText:
90  case kRegexpEmptyMatch:
91  if (re->parse_flags() & Regexp::WasDollar)
92  return false;
93  break;
94 
95  // Look for ^ in multi-line mode.
96  case kRegexpBeginLine:
97  // No condition: in single-line mode ^ becomes kRegexpBeginText.
98  return false;
99 
100  default:
101  break;
102  }
103 
104  // Not proven guilty.
105  return true;
106 }
107 
108 // Returns whether this regexp's behavior will mimic PCRE's exactly.
109 bool Regexp::MimicsPCRE() {
110  PCREWalker w;
111  return w.Walk(this, true);
112 }
113 
114 
115 // Walker class to compute whether a Regexp can match an empty string.
116 // It is okay to overestimate. For example, \b\B cannot match an empty
117 // string, because \b and \B are mutually exclusive, but this isn't
118 // that smart and will say it can. Spurious empty strings
119 // will reduce the number of regexps we sanity check against PCRE,
120 // but they won't break anything.
121 
122 class EmptyStringWalker : public Regexp::Walker<bool> {
123  public:
125 
126  virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
127  bool* child_args, int nchild_args);
128 
129  virtual bool ShortVisit(Regexp* re, bool a) {
130  // Should never be called: we use Walk(), not WalkExponential().
131 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
132  LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
133 #endif
134  return a;
135  }
136 
137  private:
138  EmptyStringWalker(const EmptyStringWalker&) = delete;
140 };
141 
142 // Called after visiting re's children. child_args contains the return
143 // value from each of the children's PostVisits (i.e., whether each child
144 // can match an empty string). Returns whether this clause can match an
145 // empty string.
146 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
147  bool* child_args, int nchild_args) {
148  switch (re->op()) {
149  case kRegexpNoMatch: // never empty
150  case kRegexpLiteral:
151  case kRegexpAnyChar:
152  case kRegexpAnyByte:
153  case kRegexpCharClass:
155  return false;
156 
157  case kRegexpEmptyMatch: // always empty
158  case kRegexpBeginLine: // always empty, when they match
159  case kRegexpEndLine:
161  case kRegexpWordBoundary:
162  case kRegexpBeginText:
163  case kRegexpEndText:
164  case kRegexpStar: // can always be empty
165  case kRegexpQuest:
166  case kRegexpHaveMatch:
167  return true;
168 
169  case kRegexpConcat: // can be empty if all children can
170  for (int i = 0; i < nchild_args; i++)
171  if (!child_args[i])
172  return false;
173  return true;
174 
175  case kRegexpAlternate: // can be empty if any child can
176  for (int i = 0; i < nchild_args; i++)
177  if (child_args[i])
178  return true;
179  return false;
180 
181  case kRegexpPlus: // can be empty if the child can
182  case kRegexpCapture:
183  return child_args[0];
184 
185  case kRegexpRepeat: // can be empty if child can or is x{0}
186  return child_args[0] || re->min() == 0;
187  }
188  return false;
189 }
190 
191 // Returns whether re can match an empty string.
192 static bool CanBeEmptyString(Regexp* re) {
194  return w.Walk(re, true);
195 }
196 
197 } // namespace re2
re2::kRegexpEmptyMatch
@ kRegexpEmptyMatch
Definition: bloaty/third_party/re2/re2/regexp.h:107
re2::kRegexpBeginLine
@ kRegexpBeginLine
Definition: bloaty/third_party/re2/re2/regexp.h:142
re2::kRegexpNoWordBoundary
@ kRegexpNoWordBoundary
Definition: bloaty/third_party/re2/re2/regexp.h:149
re2::kRegexpLiteralString
@ kRegexpLiteralString
Definition: bloaty/third_party/re2/re2/regexp.h:113
re2::Regexp
Definition: bloaty/third_party/re2/re2/regexp.h:274
re2::kRegexpLiteral
@ kRegexpLiteral
Definition: bloaty/third_party/re2/re2/regexp.h:110
re2
Definition: bloaty/third_party/re2/re2/bitmap256.h:17
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
re2::kRegexpHaveMatch
@ kRegexpHaveMatch
Definition: bloaty/third_party/re2/re2/regexp.h:161
re2::EmptyStringWalker::PostVisit
bool PostVisit(Regexp *re, bool parent_arg, bool pre_arg, bool *child_args, int nchild_args)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:136
re2::kRegexpWordBoundary
@ kRegexpWordBoundary
Definition: bloaty/third_party/re2/re2/regexp.h:147
re2::kRegexpEndText
@ kRegexpEndText
Definition: bloaty/third_party/re2/re2/regexp.h:154
re2::EmptyStringWalker::EmptyStringWalker
EmptyStringWalker()
Definition: re2/re2/mimics_pcre.cc:124
re2::kRegexpConcat
@ kRegexpConcat
Definition: bloaty/third_party/re2/re2/regexp.h:116
LOG
#define LOG(severity)
Definition: bloaty/third_party/re2/util/logging.h:53
re2::Regexp::MimicsPCRE
bool MimicsPCRE()
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:102
re2::kRegexpAnyChar
@ kRegexpAnyChar
Definition: bloaty/third_party/re2/re2/regexp.h:136
re2::kRegexpCharClass
@ kRegexpCharClass
Definition: bloaty/third_party/re2/re2/regexp.h:157
re2::PCREWalker::PCREWalker
PCREWalker()
Definition: re2/re2/mimics_pcre.cc:40
re2::PCREWalker
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:38
re2::kRegexpAlternate
@ kRegexpAlternate
Definition: bloaty/third_party/re2/re2/regexp.h:118
re2::kRegexpStar
@ kRegexpStar
Definition: bloaty/third_party/re2/re2/regexp.h:121
re2::PCREWalker::operator=
PCREWalker & operator=(const PCREWalker &)=delete
re2::EmptyStringWalker::operator=
EmptyStringWalker & operator=(const EmptyStringWalker &)=delete
re2::kRegexpAnyByte
@ kRegexpAnyByte
Definition: bloaty/third_party/re2/re2/regexp.h:139
re2::kRegexpRepeat
@ kRegexpRepeat
Definition: bloaty/third_party/re2/re2/regexp.h:129
re2::kRegexpBeginText
@ kRegexpBeginText
Definition: bloaty/third_party/re2/re2/regexp.h:152
re2::kRegexpEndLine
@ kRegexpEndLine
Definition: bloaty/third_party/re2/re2/regexp.h:144
re2::Regexp::WasDollar
@ WasDollar
Definition: bloaty/third_party/re2/re2/regexp.h:316
re2::kRegexpCapture
@ kRegexpCapture
Definition: bloaty/third_party/re2/re2/regexp.h:133
re2::EmptyStringWalker
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:115
re2::CanBeEmptyString
static bool CanBeEmptyString(Regexp *re)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:182
re2::kRegexpPlus
@ kRegexpPlus
Definition: bloaty/third_party/re2/re2/regexp.h:123
re2::kRegexpNoMatch
@ kRegexpNoMatch
Definition: bloaty/third_party/re2/re2/regexp.h:104
re2::PCREWalker::PostVisit
bool PostVisit(Regexp *re, bool parent_arg, bool pre_arg, bool *child_args, int nchild_args)
Definition: bloaty/third_party/re2/re2/mimics_pcre.cc:54
re2::kRegexpQuest
@ kRegexpQuest
Definition: bloaty/third_party/re2/re2/regexp.h:125
re2::EmptyStringWalker::ShortVisit
virtual bool ShortVisit(Regexp *re, bool a)
Definition: re2/re2/mimics_pcre.cc:129
re2::PCREWalker::ShortVisit
virtual bool ShortVisit(Regexp *re, bool a)
Definition: re2/re2/mimics_pcre.cc:45
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:29