29 #include "util/util.h"
30 #include "util/logging.h"
31 #include "util/strutil.h"
34 #include "re2/regexp.h"
46 : encoding_(opt ==
RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
47 posix_syntax_(opt ==
RE2::POSIX),
48 longest_match_(opt ==
RE2::POSIX),
49 log_errors_(opt !=
RE2::Quiet),
50 max_mem_(kDefaultMaxMem),
54 never_capture_(
false),
55 case_sensitive_(
true),
57 word_boundary_(
false),
75 return RE2::ErrorInternal;
77 return RE2::ErrorBadEscape;
79 return RE2::ErrorBadCharClass;
81 return RE2::ErrorBadCharRange;
83 return RE2::ErrorMissingBracket;
85 return RE2::ErrorMissingParen;
87 return RE2::ErrorUnexpectedParen;
89 return RE2::ErrorTrailingBackslash;
91 return RE2::ErrorRepeatArgument;
93 return RE2::ErrorRepeatSize;
95 return RE2::ErrorRepeatOp;
97 return RE2::ErrorBadPerlOp;
99 return RE2::ErrorBadUTF8;
101 return RE2::ErrorBadNamedCapture;
103 return RE2::ErrorInternal;
113 RE2::RE2(
const char*
pattern) {
121 RE2::RE2(
const StringPiece&
pattern) {
129 int RE2::Options::ParseFlags()
const {
130 int flags = Regexp::ClassNL;
136 case RE2::Options::EncodingUTF8:
138 case RE2::Options::EncodingLatin1:
139 flags |= Regexp::Latin1;
144 flags |= Regexp::LikePerl;
147 flags |= Regexp::Literal;
150 flags |= Regexp::NeverNL;
153 flags |= Regexp::DotNL;
156 flags |= Regexp::NeverCapture;
158 if (!case_sensitive())
159 flags |= Regexp::FoldCase;
162 flags |= Regexp::PerlClasses;
165 flags |= Regexp::PerlB;
168 flags |= Regexp::OneLine;
183 entire_regexp_ = NULL;
185 error_code_ = NoError;
188 prefix_foldcase_ =
false;
189 suffix_regexp_ = NULL;
192 is_one_pass_ =
false;
195 named_groups_ = NULL;
201 static_cast<Regexp::ParseFlags
>(
options_.ParseFlags()),
203 if (entire_regexp_ == NULL) {
215 if (entire_regexp_->RequiredPrefix(&
prefix_, &prefix_foldcase_, &
suffix))
218 suffix_regexp_ = entire_regexp_->Incref();
223 prog_ = suffix_regexp_->CompileToProg(
options_.max_mem()*2/3);
228 error_code_ = RE2::ErrorPatternTooLarge;
235 num_captures_ = suffix_regexp_->NumCaptures();
242 is_one_pass_ = prog_->IsOnePass();
249 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
250 if (re->rprog_ == NULL) {
251 if (re->options_.log_errors())
252 LOG(ERROR) <<
"Error reverse compiling '" << trunc(re->pattern_) <<
"'";
265 suffix_regexp_->Decref();
267 entire_regexp_->Decref();
273 delete named_groups_;
278 int RE2::ProgramSize()
const {
281 return prog_->size();
284 int RE2::ReverseProgramSize()
const {
287 Prog*
prog = ReverseProg();
296 #if defined(__GNUC__)
297 return 31 ^ __builtin_clz(
n);
298 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
300 _BitScanReverse(&
c,
n);
301 return static_cast<int>(
c);
304 for (
int shift = 1 << 4; shift != 0; shift >>= 1) {
317 prog->Fanout(&fanout);
334 int RE2::ProgramFanout(std::vector<int>*
histogram)
const {
340 int RE2::ReverseProgramFanout(std::vector<int>*
histogram)
const {
350 const std::map<std::string, int>& RE2::NamedCapturingGroups()
const {
357 return *named_groups_;
361 const std::map<int, std::string>& RE2::CapturingGroupNames()
const {
363 if (re->suffix_regexp_ != NULL)
364 re->group_names_ = re->suffix_regexp_->CaptureNames();
365 if (re->group_names_ == NULL)
368 return *group_names_;
373 bool RE2::FullMatchN(
const StringPiece&
text,
const RE2& re,
375 return re.DoMatch(
text, ANCHOR_BOTH, NULL,
args,
n);
378 bool RE2::PartialMatchN(
const StringPiece&
text,
const RE2& re,
380 return re.DoMatch(
text, UNANCHORED, NULL,
args,
n);
383 bool RE2::ConsumeN(StringPiece*
input,
const RE2& re,
386 if (re.DoMatch(*
input, ANCHOR_START, &consumed,
args,
n)) {
387 input->remove_prefix(consumed);
394 bool RE2::FindAndConsumeN(StringPiece*
input,
const RE2& re,
397 if (re.DoMatch(*
input, UNANCHORED, &consumed,
args,
n)) {
398 input->remove_prefix(consumed);
407 const StringPiece& rewrite) {
409 int nvec = 1 + MaxSubmatch(rewrite);
410 if (nvec > 1 + re.NumberOfCapturingGroups())
412 if (nvec >
static_cast<int>(
arraysize(vec)))
414 if (!re.Match(*
str, 0,
str->size(), UNANCHORED, vec, nvec))
418 if (!re.Rewrite(&s, rewrite, vec, nvec))
421 assert(vec[0].
data() >=
str->data());
422 assert(vec[0].
data() + vec[0].
size() <=
str->data() +
str->size());
423 str->replace(vec[0].
data() -
str->data(), vec[0].size(), s);
429 const StringPiece& rewrite) {
431 int nvec = 1 + MaxSubmatch(rewrite);
432 if (nvec > 1 + re.NumberOfCapturingGroups())
434 if (nvec >
static_cast<int>(
arraysize(vec)))
437 const char*
p =
str->data();
438 const char* ep =
p +
str->size();
439 const char* lastend = NULL;
442 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
445 while (
p ==
str->data()) {
449 if (!re.Match(*
str,
static_cast<size_t>(
p -
str->data()),
450 str->size(), UNANCHORED, vec, nvec))
452 if (
p < vec[0].
data())
454 if (vec[0].
data() == lastend && vec[0].
empty()) {
459 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
484 re.Rewrite(&
out, rewrite, vec, nvec);
485 p = vec[0].data() + vec[0].size();
494 out.append(
p, ep -
p);
500 bool RE2::Extract(
const StringPiece&
text,
502 const StringPiece& rewrite,
505 int nvec = 1 + MaxSubmatch(rewrite);
506 if (nvec > 1 + re.NumberOfCapturingGroups())
508 if (nvec >
static_cast<int>(
arraysize(vec)))
510 if (!re.Match(
text, 0,
text.size(), UNANCHORED, vec, nvec))
514 return re.Rewrite(
out, rewrite, vec, nvec);
517 std::string RE2::QuoteMeta(
const StringPiece& unquoted) {
519 result.reserve(unquoted.size() << 1);
528 for (
size_t ii = 0; ii < unquoted.size(); ++ii) {
531 if ((unquoted[ii] <
'a' || unquoted[ii] >
'z') &&
532 (unquoted[ii] <
'A' || unquoted[ii] >
'Z') &&
533 (unquoted[ii] <
'0' || unquoted[ii] >
'9') &&
534 unquoted[ii] !=
'_' &&
538 !(unquoted[ii] & 128)) {
539 if (unquoted[ii] ==
'\0') {
560 int n =
static_cast<int>(
prefix_.size());
567 if (prefix_foldcase_) {
569 for (
int i = 0;
i <
n;
i++) {
571 if (
'a' <=
c &&
c <=
'z')
579 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
582 }
else if (!
max->empty()) {
600 const char* ae =
a +
len;
602 for (;
a < ae;
a++,
b++) {
605 if (
'A' <=
y &&
y <=
'Z')
620 StringPiece* submatch,
621 int nsubmatch)
const {
628 if (startpos > endpos || endpos >
text.size()) {
630 LOG(
ERROR) <<
"RE2: invalid startpos, endpos pair. ["
631 <<
"startpos: " << startpos <<
", "
632 <<
"endpos: " << endpos <<
", "
633 <<
"text size: " <<
text.size() <<
"]";
637 StringPiece subtext =
text;
638 subtext.remove_prefix(startpos);
639 subtext.remove_suffix(
text.size() - endpos);
646 StringPiece* matchp = &
match;
650 int ncap = 1 + NumberOfCapturingGroups();
651 if (ncap > nsubmatch)
655 if (prog_->anchor_start() && startpos != 0)
657 if (prog_->anchor_end() && endpos !=
text.size())
662 if (prog_->anchor_start() && prog_->anchor_end())
663 re_anchor = ANCHOR_BOTH;
664 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
665 re_anchor = ANCHOR_START;
668 size_t prefixlen = 0;
673 if (prefixlen > subtext.size())
675 if (prefix_foldcase_) {
679 if (memcmp(&
prefix_[0], subtext.data(), prefixlen) != 0)
682 subtext.remove_prefix(prefixlen);
684 if (re_anchor != ANCHOR_BOTH)
685 re_anchor = ANCHOR_START;
688 Prog::Anchor anchor = Prog::kUnanchored;
691 kind = Prog::kLongestMatch;
693 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
699 const int kMaxBitStateBitmapSize = 256*1024;
700 bool can_bit_state = prog_->CanBitState();
701 size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
703 #ifdef RE2_HAVE_THREAD_LOCAL
706 bool dfa_failed =
false;
707 bool skipped_test =
false;
710 LOG(DFATAL) <<
"Unexpected re_anchor value: " << re_anchor;
714 if (prog_->anchor_end()) {
718 Prog*
prog = ReverseProg();
724 if (!
prog->SearchDFA(subtext,
text, Prog::kAnchored,
725 Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
729 <<
"pattern length " << pattern_.size() <<
", "
730 <<
"program size " <<
prog->size() <<
", "
731 <<
"list count " <<
prog->list_count() <<
", "
732 <<
"bytemap range " <<
prog->bytemap_range();
744 if (!prog_->SearchDFA(subtext,
text, anchor, kind,
745 matchp, &dfa_failed, NULL)) {
749 <<
"pattern length " << pattern_.size() <<
", "
750 <<
"program size " << prog_->size() <<
", "
751 <<
"list count " << prog_->list_count() <<
", "
752 <<
"bytemap range " << prog_->bytemap_range();
764 Prog*
prog = ReverseProg();
771 Prog::kLongestMatch, &
match, &dfa_failed, NULL)) {
775 <<
"pattern length " << pattern_.size() <<
", "
776 <<
"program size " <<
prog->size() <<
", "
777 <<
"list count " <<
prog->list_count() <<
", "
778 <<
"bytemap range " <<
prog->bytemap_range();
784 LOG(
ERROR) <<
"SearchDFA inconsistency";
792 if (re_anchor == ANCHOR_BOTH)
793 kind = Prog::kFullMatch;
794 anchor = Prog::kAnchored;
803 if (can_one_pass &&
text.size() <= 4096 &&
804 (ncap > 1 ||
text.size() <= 16)) {
808 if (can_bit_state &&
text.size() <= bit_state_text_max && ncap > 1) {
812 if (!prog_->SearchDFA(subtext,
text, anchor, kind,
813 &
match, &dfa_failed, NULL)) {
817 <<
"pattern length " << pattern_.size() <<
", "
818 <<
"program size " << prog_->size() <<
", "
819 <<
"list count " << prog_->list_count() <<
", "
820 <<
"bytemap range " << prog_->bytemap_range();
830 if (!skipped_test && ncap <= 1) {
835 StringPiece subtext1;
845 anchor = Prog::kAnchored;
846 kind = Prog::kFullMatch;
849 if (can_one_pass && anchor != Prog::kUnanchored) {
850 if (!prog_->SearchOnePass(subtext1,
text, anchor, kind, submatch, ncap)) {
851 if (!skipped_test &&
options_.log_errors())
852 LOG(
ERROR) <<
"SearchOnePass inconsistency";
855 }
else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
856 if (!prog_->SearchBitState(subtext1,
text, anchor,
857 kind, submatch, ncap)) {
858 if (!skipped_test &&
options_.log_errors())
859 LOG(
ERROR) <<
"SearchBitState inconsistency";
863 if (!prog_->SearchNFA(subtext1,
text, anchor, kind, submatch, ncap)) {
864 if (!skipped_test &&
options_.log_errors())
865 LOG(
ERROR) <<
"SearchNFA inconsistency";
872 if (prefixlen > 0 && nsubmatch > 0)
873 submatch[0] = StringPiece(submatch[0].
data() - prefixlen,
874 submatch[0].
size() + prefixlen);
877 for (
int i = ncap;
i < nsubmatch;
i++)
878 submatch[
i] = StringPiece();
883 bool RE2::DoMatch(
const StringPiece&
text,
894 if (NumberOfCapturingGroups() <
n) {
901 if (
n == 0 && consumed == NULL)
908 StringPiece* heapvec = NULL;
910 if (nvec <=
static_cast<int>(
arraysize(stkvec))) {
913 vec =
new StringPiece[nvec];
922 if (consumed != NULL)
923 *consumed =
static_cast<size_t>(vec[0].end() -
text.begin());
925 if (
n == 0 ||
args == NULL) {
932 for (
int i = 0;
i <
n;
i++) {
933 const StringPiece&
s = vec[
i+1];
947 bool RE2::CheckRewriteString(
const StringPiece& rewrite,
950 for (
const char *s = rewrite.data(), *
end = s + rewrite.size();
957 *
error =
"Rewrite schema error: '\\' not allowed at end.";
965 *
error =
"Rewrite schema error: "
966 "'\\' must be followed by a digit or '\\'.";
975 if (max_token > NumberOfCapturingGroups()) {
977 "Rewrite schema requests %d matches, but the regexp only has %d "
978 "parenthesized subexpressions.",
979 max_token, NumberOfCapturingGroups());
987 int RE2::MaxSubmatch(
const StringPiece& rewrite) {
989 for (
const char *s = rewrite.data(), *
end = s + rewrite.size();
993 int c = (
s <
end) ? *s : -1;
1007 const StringPiece& rewrite,
1008 const StringPiece* vec,
1010 for (
const char *s = rewrite.data(), *
end = s + rewrite.size();
1017 int c = (
s <
end) ? *s : -1;
1022 LOG(
ERROR) <<
"invalid substitution \\" <<
n
1023 <<
" from " << veclen <<
" groups";
1027 StringPiece snip = vec[
n];
1029 out->append(snip.data(), snip.size());
1030 }
else if (
c ==
'\\') {
1031 out->push_back(
'\\');
1034 LOG(
ERROR) <<
"invalid rewrite pattern: " << rewrite.data();
1043 namespace re2_internal {
1048 return (
dest == NULL);
1053 if (
dest == NULL)
return true;
1060 if (
dest == NULL)
return true;
1067 if (
n != 1)
return false;
1068 if (
dest == NULL)
return true;
1075 if (
n != 1)
return false;
1076 if (
dest == NULL)
return true;
1083 if (
n != 1)
return false;
1084 if (
dest == NULL)
return true;
1096 size_t* np,
bool accept_spaces) {
1098 if (
n == 0)
return "";
1099 if (
n > 0 && isspace(*
str)) {
1102 if (!accept_spaces) {
1105 while (
n > 0 && isspace(*
str)) {
1120 if (
n >= 1 &&
str[0] ==
'-') {
1126 if (
n >= 3 &&
str[0] ==
'0' &&
str[1] ==
'0') {
1127 while (
n >= 3 &&
str[2] ==
'0') {
1138 if (
n > nbuf-1)
return "";
1151 if (
n == 0)
return false;
1152 static const int kMaxLength = 200;
1153 char buf[kMaxLength+1];
1158 if (
end !=
str +
n)
return false;
1159 if (errno)
return false;
1160 if (
dest == NULL)
return true;
1167 if (
n == 0)
return false;
1168 static const int kMaxLength = 200;
1169 char buf[kMaxLength+1];
1174 if (
end !=
str +
n)
return false;
1175 if (errno)
return false;
1176 if (
dest == NULL)
return true;
1183 if (
n == 0)
return false;
1189 if (
end !=
str +
n)
return false;
1190 if (errno)
return false;
1191 if (
dest == NULL)
return true;
1198 if (
n == 0)
return false;
1201 if (
str[0] ==
'-') {
1210 if (
end !=
str +
n)
return false;
1211 if (errno)
return false;
1212 if (
dest == NULL)
return true;
1221 if ((
short)
r !=
r)
return false;
1222 if (
dest == NULL)
return true;
1231 if ((
unsigned short)
r !=
r)
return false;
1232 if (
dest == NULL)
return true;
1233 *
dest = (
unsigned short)
r;
1241 if ((
int)
r !=
r)
return false;
1242 if (
dest == NULL)
return true;
1251 if ((
unsigned int)
r !=
r)
return false;
1252 if (
dest == NULL)
return true;
1259 if (
n == 0)
return false;
1265 if (
end !=
str +
n)
return false;
1266 if (errno)
return false;
1267 if (
dest == NULL)
return true;
1274 if (
n == 0)
return false;
1277 if (
str[0] ==
'-') {
1285 if (
end !=
str +
n)
return false;
1286 if (errno)
return false;
1287 if (
dest == NULL)
return true;
1296 #ifdef RE2_HAVE_THREAD_LOCAL
1300 template <
typename T>
1303 T*
Load()
const {
return cb_.load(std::memory_order_acquire); }
1305 #if !defined(__clang__) && defined(_MSC_VER)
1308 static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
1309 "std::atomic<T*> must be always lock-free");
1310 T* cb_for_constinit_;
1316 template <
typename T>
1319 #define DEFINE_HOOK(type, name) \
1320 static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
1321 void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1322 type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1324 DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)