28 #include "util/util.h"
29 #include "util/logging.h"
30 #include "util/strutil.h"
33 #include "re2/regexp.h"
34 #include "re2/stringpiece.h"
35 #include "re2/unicode_casefold.h"
36 #include "re2/unicode_groups.h"
37 #include "re2/walker-inl.h"
39 #if defined(RE2_USE_ICU)
40 #include "unicode/uniset.h"
41 #include "unicode/unistr.h"
42 #include "unicode/utypes.h"
197 : flags_(
flags), whole_regexp_(whole_regexp),
208 for (
Regexp* re = stacktop_; re != NULL; re =
next) {
226 CharClassBuilder* ccb = re->
ccb_;
246 re->ccb_->RemoveAbove(rune_max_);
247 if (re->ccb_->size() == 1) {
248 Rune r = re->ccb_->begin()->lo;
252 }
else if (re->ccb_->size() == 2) {
253 Rune r = re->ccb_->begin()->lo;
257 re->rune_ =
r +
'a' -
'A';
262 if (!IsMarker(re->op()))
263 re->simple_ = re->ComputeSimple();
264 re->down_ = stacktop_;
273 const CaseFold* ef =
f +
n;
278 if (f[
m].lo <=
r &&
r <= f[
m].hi)
337 if (f == NULL || r < f->lo)
351 LOG(DFATAL) <<
"AddFoldedRange recurses too much.";
355 if (!
cc->AddRange(lo, hi))
370 Rune hi1 = std::min<Rune>(hi, f->hi);
401 re->ccb_ =
new CharClassBuilder;
404 if (!(flags_ &
NeverNL) ||
r !=
'\n') {
405 re->ccb_->AddRange(
r,
r);
409 return PushRegexp(re);
413 if ((flags_ &
NeverNL) &&
r ==
'\n')
417 if (MaybeConcatString(
r, flags_))
422 return PushRegexp(re);
460 re->ccb_ =
new CharClassBuilder;
461 re->ccb_->AddRange(0,
'\n' - 1);
462 re->ccb_->AddRange(
'\n' + 1, rune_max_);
463 return PushRegexp(re);
469 return PushRegexp(re);
477 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
488 if (
op == stacktop_->op() && fl == stacktop_->parse_flags())
497 fl == stacktop_->parse_flags()) {
504 re->down_ = stacktop_->down_;
505 re->sub()[0] = FinishRegexp(stacktop_);
506 re->simple_ = re->ComputeSimple();
520 class RepetitionWalker :
public Regexp::Walker<int> {
523 virtual int PreVisit(
Regexp* re,
int parent_arg,
bool*
stop);
524 virtual int PostVisit(
Regexp* re,
int parent_arg,
int pre_arg,
525 int* child_args,
int nchild_args);
526 virtual int ShortVisit(
Regexp* re,
int parent_arg);
534 int arg = parent_arg;
548 int* child_args,
int nchild_args) {
550 for (
int i = 0;
i < nchild_args;
i++) {
551 if (child_args[i] <
arg) {
560 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
561 LOG(DFATAL) <<
"RepetitionWalker::ShortVisit called";
569 const StringPiece& s,
578 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
590 re->down_ = stacktop_->down_;
591 re->sub()[0] = FinishRegexp(stacktop_);
592 re->simple_ = re->ComputeSimple();
594 if (
min >= 2 ||
max >= 2) {
615 if (
name.data() != NULL)
617 return PushRegexp(re);
624 return PushRegexp(re);
639 if ((r1 = stacktop_) != NULL &&
640 (r2 = r1->down_) != NULL &&
643 if ((r3 = r2->down_) != NULL &&
661 r1->down_ = r3->down_;
669 r1->down_ = r2->down_;
687 if ((r1 = stacktop_) == NULL ||
688 (r2 = r1->down_) == NULL ||
691 status_->set_error_arg(whole_regexp_);
696 stacktop_ = r2->down_;
700 flags_ = re->parse_flags();
707 re->sub()[0] = FinishRegexp(r1);
708 re->simple_ = re->ComputeSimple();
713 return PushRegexp(re);
720 if (re != NULL && re->down_ != NULL) {
722 status_->set_error_arg(whole_regexp_);
726 return FinishRegexp(re);
757 if (re->nsub() == 2) {
790 *nrune = re->nrunes_;
819 if (n >= re->nrunes_) {
824 }
else if (n == re->nrunes_ - 1) {
825 Rune rune = re->runes_[re->nrunes_ - 1];
833 memmove(re->runes_, re->runes_ + n, re->nrunes_ *
sizeof re->runes_[0]);
845 switch (re->nsub()) {
849 LOG(DFATAL) <<
"Concat of " << re->nsub();
905 std::vector<Splice> splices;
915 std::vector<Splice>* splices);
918 std::vector<Splice>* splices);
921 std::vector<Splice>* splices);
936 std::vector<Frame> stk;
940 auto&
sub = stk.back().
sub;
941 auto&
nsub = stk.back().nsub;
942 auto&
round = stk.back().round;
943 auto& splices = stk.back().splices;
944 auto& spliceidx = stk.back().spliceidx;
946 if (splices.empty()) {
950 }
else if (spliceidx <
static_cast<int>(splices.size())) {
952 stk.emplace_back(splices[spliceidx].
sub, splices[spliceidx].
nsub);
956 auto iter = splices.begin();
958 for (
int i = 0;
i <
nsub; ) {
960 while (
sub + i < iter->
sub)
967 re[0] =
iter->prefix;
979 LOG(DFATAL) <<
"unknown round: " <<
round;
983 if (++
iter == splices.end()) {
1005 if (stk.size() == 1) {
1013 stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1014 ++stk.back().spliceidx;
1018 LOG(DFATAL) <<
"unknown round: " <<
round;
1023 if (splices.empty() ||
round == 3) {
1024 spliceidx =
static_cast<int>(splices.size());
1033 std::vector<Splice>* splices) {
1039 for (
int i = 0;
i <=
nsub;
i++) {
1042 Rune* rune_i = NULL;
1047 if (runeflags_i == runeflags) {
1049 while (same < nrune && same < nrune_i &&
rune[same] == rune_i[same])
1064 }
else if (i ==
start+1) {
1078 runeflags = runeflags_i;
1085 std::vector<Splice>* splices) {
1096 for (
int i = 0;
i <=
nsub;
i++) {
1102 if (
first != NULL &&
1130 }
else if (i ==
start+1) {
1149 std::vector<Splice>* splices) {
1153 for (
int i = 0;
i <=
nsub;
i++) {
1159 if (
first != NULL &&
1172 }
else if (i ==
start+1) {
1175 CharClassBuilder ccb;
1179 CharClass*
cc = re->cc();
1181 ccb.AddRange(
it->lo,
it->hi);
1183 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1185 LOG(DFATAL) <<
"RE2: unexpected op: " << re->op() <<
" "
1220 if (stacktop_ != NULL && stacktop_->down_ ==
next)
1224 PODArray<Regexp*>
subs(n);
1232 subs[--
i] = sub_subs[
k]->Incref();
1240 re->simple_ = re->ComputeSimple();
1249 if (r1 == NULL || IsMarker(r1->op())) {
1263 stacktop_ = r1->down_;
1281 if ((re1 = stacktop_) == NULL || (
re2 = re1->down_) == NULL)
1302 re2->AddRuneToString(re1->rune_);
1304 for (
int i = 0;
i < re1->nrunes_;
i++)
1305 re2->AddRuneToString(re1->runes_[i]);
1307 delete[] re1->runes_;
1329 if (s->empty() || !isdigit((*s)[0] & 0xFF))
1332 if (s->size() >= 2 && (*s)[0] ==
'0' && isdigit((*s)[1] & 0xFF))
1336 while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
1341 s->remove_prefix(1);
1358 if (s.empty() || s[0] !=
'{')
1381 if (s.empty() || s[0] !=
'}')
1424 while (!t.empty()) {
1433 return (
'0' <= c && c <=
'9') ||
1434 (
'A' <= c && c <=
'F') ||
1435 (
'a' <= c && c <=
'f');
1440 if (
'0' <= c && c <=
'9')
1442 if (
'A' <= c && c <=
'F')
1443 return c -
'A' + 10;
1444 if (
'a' <= c && c <=
'f')
1445 return c -
'a' + 10;
1446 LOG(DFATAL) <<
"Bad hex digit " << c;
1455 const char*
begin = s->data();
1456 if (s->empty() || (*s)[0] !=
'\\') {
1462 if (s->size() == 1) {
1468 s->remove_prefix(1);
1474 if (c <
Runeself && !isalpha(c) && !isdigit(c)) {
1493 if (s->empty() || (*s)[0] <
'0' || (*s)[0] >
'7')
1499 if (!s->empty() &&
'0' <= (c = (*s)[0]) && c <=
'7') {
1501 s->remove_prefix(1);
1504 if (
'0' <= c && c <=
'7') {
1506 s->remove_prefix(1);
1510 if (
code > rune_max)
1535 if (
code > rune_max)
1542 if (c !=
'}' || nhex == 0)
1592 LOG(DFATAL) <<
"Not reached in ParseEscape.";
1610 if (cutnl && lo <=
'\n' &&
'\n' <= hi) {
1629 for (
int i = 0;
i < ngroups;
i++)
1644 #if !defined(RE2_USE_ICU)
1663 for (
int i = 0;
i <
g->nr16;
i++) {
1666 for (
int i = 0;
i <
g->nr32;
i++) {
1686 cc->AddCharClass(&ccb1);
1690 for (
int i = 0;
i <
g->nr16;
i++) {
1691 if (next < g->r16[
i].lo)
1695 for (
int i = 0;
i <
g->nr32;
i++) {
1696 if (next < g->r32[
i].lo)
1714 if (s->size() < 2 || (*s)[0] !=
'\\')
1718 StringPiece
name(s->data(), 2);
1722 s->remove_prefix(
name.size());
1735 CharClassBuilder *
cc,
1740 if (s->size() < 2 || (*s)[0] !=
'\\')
1743 if (c !=
'p' && c !=
'P')
1750 StringPiece seq = *s;
1752 s->remove_prefix(2);
1758 const char* p = seq.data() + 2;
1759 name = StringPiece(p,
static_cast<size_t>(s->data() - p));
1762 size_t end =
s->find(
'}', 0);
1767 status->set_error_arg(seq);
1770 name = StringPiece(
s->data(),
end);
1771 s->remove_prefix(
end + 1);
1777 seq = StringPiece(seq.data(),
static_cast<size_t>(
s->data() - seq.data()));
1779 if (!
name.empty() &&
name[0] ==
'^') {
1781 name.remove_prefix(1);
1784 #if !defined(RE2_USE_ICU)
1789 status->set_error_arg(seq);
1797 ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1799 UErrorCode uerr = U_ZERO_ERROR;
1800 ::icu::UnicodeSet uset(ustr, uerr);
1801 if (U_FAILURE(uerr)) {
1803 status->set_error_arg(seq);
1808 int nr = uset.getRangeCount();
1809 PODArray<URange32>
r(nr);
1810 for (
int i = 0;
i < nr;
i++) {
1811 r[
i].lo = uset.getRangeStart(i);
1812 r[
i].hi = uset.getRangeEnd(i);
1814 UGroup
g = {
"", +1, 0, 0,
r.data(), nr};
1828 const char* p = s->data();
1829 const char* ep = s->data() + s->size();
1830 if (ep - p < 2 || p[0] !=
'[' || p[1] !=
':')
1835 for (q = p+2; q <= ep-2 && (*q !=
':' || *(q+1) !=
']'); q++)
1853 s->remove_prefix(
name.size());
1863 const StringPiece& whole_class,
1867 status->set_error_arg(whole_class);
1873 if ((*s)[0] ==
'\\')
1886 const StringPiece& whole_class,
1888 StringPiece os = *
s;
1889 if (!ParseCCCharacter(s, &rr->lo, whole_class,
status))
1892 if (
s->size() >= 2 && (*s)[0] ==
'-' && (*s)[1] !=
']') {
1893 s->remove_prefix(1);
1894 if (!ParseCCCharacter(s, &rr->hi, whole_class,
status))
1896 if (rr->hi < rr->lo) {
1899 StringPiece(os.data(),
static_cast<size_t>(
s->data() - os.data())));
1914 StringPiece whole_class = *
s;
1915 if (
s->empty() || (*s)[0] !=
'[') {
1918 status->set_error_arg(StringPiece());
1921 bool negated =
false;
1923 re->ccb_ =
new CharClassBuilder;
1924 s->remove_prefix(1);
1925 if (!
s->empty() && (*s)[0] ==
'^') {
1926 s->remove_prefix(1);
1931 re->ccb_->AddRange(
'\n',
'\n');
1935 while (!
s->empty() && ((*s)[0] !=
']' ||
first)) {
1938 if ((*s)[0] ==
'-' && !
first && !(flags_&
PerlX) &&
1939 (
s->size() == 1 || (*s)[1] !=
']')) {
1949 status->set_error_arg(StringPiece(
s->data(), 1+n));
1956 if (
s->size() > 2 && (*s)[0] ==
'[' && (*s)[1] ==
':') {
1969 if (
s->size() > 2 &&
1971 ((*s)[1] ==
'p' || (*s)[1] ==
'P')) {
1992 if (!ParseCCRange(s, &rr, whole_class,
status)) {
2005 status->set_error_arg(whole_class);
2009 s->remove_prefix(1);
2034 {
"Lu",
"Ll",
"Lt",
"Lm",
"Lo",
"Nl",
"Mn",
"Mc",
"Nd",
"Pc"})
2042 while (!t.empty()) {
2061 if (!(flags_ &
PerlX) || t.size() < 2 || t[0] !=
'(' || t[1] !=
'?') {
2062 LOG(DFATAL) <<
"Bad call to ParseState::ParsePerlFlags";
2084 if (
t.size() > 2 && t[0] ==
'P' && t[1] ==
'<') {
2086 size_t end =
t.find(
'>', 2);
2096 StringPiece capture(
t.data()-2,
end+3);
2097 StringPiece
name(
t.data()+2,
end-2);
2102 status_->set_error_arg(capture);
2106 if (!DoLeftParen(
name)) {
2112 static_cast<size_t>(capture.data() + capture.size() -
s->data()));
2116 bool negated =
false;
2117 bool sawflags =
false;
2172 if (!DoLeftParenNoCapture()) {
2186 if (negated && !sawflags)
2196 StringPiece(
s->data(),
static_cast<size_t>(
t.data() -
s->data())));
2208 for (
size_t i = 0;
i < latin1.size();
i++) {
2209 Rune r = latin1[
i] & 0xFF;
2211 utf->append(
buf, n);
2222 RegexpStatus xstatus;
2230 if (global_flags &
Latin1) {
2239 while (!
t.empty()) {
2243 if (!ps.PushLiteral(
r))
2246 return ps.DoFinish();
2249 StringPiece lastunary = StringPiece();
2250 while (!
t.empty()) {
2251 StringPiece isunary = StringPiece();
2257 if (!ps.PushLiteral(
r))
2264 if ((ps.flags() &
PerlX) && (
t.size() >= 2 && t[1] ==
'?')) {
2266 if (!ps.ParsePerlFlags(&t))
2271 if (!ps.DoLeftParenNoCapture())
2274 if (!ps.DoLeftParen(StringPiece()))
2281 if (!ps.DoVerticalBar())
2287 if (!ps.DoRightParen())
2293 if (!ps.PushCaret())
2299 if (!ps.PushDollar())
2312 if (!ps.ParseCharClass(&t, &re,
status))
2314 if (!ps.PushRegexp(re))
2330 StringPiece opstr =
t;
2331 bool nongreedy =
false;
2333 if (ps.flags() &
PerlX) {
2334 if (!
t.empty() && t[0] ==
'?') {
2338 if (!lastunary.empty()) {
2343 status->set_error_arg(StringPiece(
2345 static_cast<size_t>(
t.data() - lastunary.data())));
2349 opstr = StringPiece(opstr.data(),
2350 static_cast<size_t>(
t.data() - opstr.data()));
2351 if (!ps.PushRepeatOp(
op, opstr, nongreedy))
2359 StringPiece opstr =
t;
2362 if (!ps.PushLiteral(
'{'))
2367 bool nongreedy =
false;
2368 if (ps.flags() &
PerlX) {
2369 if (!
t.empty() && t[0] ==
'?') {
2373 if (!lastunary.empty()) {
2376 status->set_error_arg(StringPiece(
2378 static_cast<size_t>(
t.data() - lastunary.data())));
2382 opstr = StringPiece(opstr.data(),
2383 static_cast<size_t>(
t.data() - opstr.data()));
2384 if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2393 t.size() >= 2 && (t[1] ==
'b' || t[1] ==
'B')) {
2394 if (!ps.PushWordBoundary(t[1] ==
'b'))
2427 while (!
t.empty()) {
2428 if (
t.size() >= 2 && t[0] ==
'\\' && t[1] ==
'E') {
2435 if (!ps.PushLiteral(
r))
2442 if (
t.size() >= 2 && (t[1] ==
'p' || t[1] ==
'P')) {
2444 re->ccb_ =
new CharClassBuilder;
2447 if (!ps.PushRegexp(re))
2462 re->ccb_ =
new CharClassBuilder;
2464 if (!ps.PushRegexp(re))
2472 if (!ps.PushLiteral(
r))
2478 lastunary = isunary;
2480 return ps.DoFinish();