19 #ifndef RAPIDJSON_INTERNAL_REGEX_H_ 20 #define RAPIDJSON_INTERNAL_REGEX_H_ 22 #include "../allocators.h" 23 #include "../stream.h" 28 RAPIDJSON_DIAG_OFF(padded)
29 RAPIDJSON_DIAG_OFF(switch - enum)
30 #elif defined(_MSC_VER) 32 RAPIDJSON_DIAG_OFF(4512)
37 RAPIDJSON_DIAG_OFF(effc++)
40 #ifndef RAPIDJSON_REGEX_VERBOSE 41 #define RAPIDJSON_REGEX_VERBOSE 0 50 template <
typename SourceStream,
typename Encoding>
78 template <
typename Encoding,
typename Allocator>
114 template <
typename Encoding,
typename Allocator = CrtAllocator>
118 typedef typename Encoding::Ch
Ch;
119 template <
typename,
typename>
124 allocator_(allocator ? allocator : ownAllocator_),
125 states_(allocator_, 256),
126 ranges_(allocator_, 256),
127 root_(kRegexInvalidState),
151 static const unsigned kAnyCharacterClass = 0xFFFFFFFF;
152 static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
153 static const unsigned kRangeNegationFlag = 0x80000000;
177 return states_.template Bottom<State>()[index];
182 return states_.template Bottom<State>()[index];
187 return ranges_.template Bottom<Range>()[index];
192 return ranges_.template Bottom<Range>()[index];
195 template <
typename InputStream>
202 *atomCountStack.template Push<unsigned>() = 0;
205 while (ds.
Peek() != 0) {
206 switch (codepoint = ds.
Take()) {
216 while (!operatorStack.
Empty() &&
217 *operatorStack.template Top<Operator>() < kAlternation)
218 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
220 *operatorStack.template Push<Operator>() = kAlternation;
221 *atomCountStack.template Top<unsigned>() = 0;
225 *operatorStack.template Push<Operator>() = kLeftParenthesis;
226 *atomCountStack.template Push<unsigned>() = 0;
230 while (!operatorStack.
Empty() &&
231 *operatorStack.template Top<Operator>() != kLeftParenthesis)
232 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
234 if (operatorStack.
Empty())
return;
235 operatorStack.template Pop<Operator>(1);
236 atomCountStack.template Pop<unsigned>(1);
237 ImplicitConcatenation(atomCountStack, operatorStack);
241 if (!Eval(operandStack, kZeroOrOne))
return;
245 if (!Eval(operandStack, kZeroOrMore))
return;
249 if (!Eval(operandStack, kOneOrMore))
return;
254 if (!ParseUnsigned(ds, &n))
return;
256 if (ds.
Peek() ==
',') {
258 if (ds.
Peek() ==
'}')
259 m = kInfinityQuantifier;
260 else if (!ParseUnsigned(ds, &m) || m < n)
265 if (!EvalQuantifier(operandStack, n, m) || ds.
Peek() !=
'}')
return;
270 PushOperand(operandStack, kAnyCharacterClass);
271 ImplicitConcatenation(atomCountStack, operatorStack);
276 if (!ParseRange(ds, &range))
return;
277 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState,
278 kRangeCharacterClass);
279 GetState(s).rangeStart = range;
280 *operandStack.template Push<Frag>() = Frag(s, s, s);
282 ImplicitConcatenation(atomCountStack, operatorStack);
286 if (!CharacterEscape(ds, &codepoint))
289 RAPIDJSON_DELIBERATE_FALLTHROUGH;
292 PushOperand(operandStack, codepoint);
293 ImplicitConcatenation(atomCountStack, operatorStack);
297 while (!operatorStack.
Empty())
298 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
return;
301 if (operandStack.
GetSize() ==
sizeof(Frag)) {
302 Frag *e = operandStack.template Pop<Frag>(1);
303 Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
306 #if RAPIDJSON_REGEX_VERBOSE 307 printf(
"root: %d\n", root_);
308 for (
SizeType i = 0; i < stateCount_; i++) {
309 State &
s = GetState(i);
310 printf(
"[%2d] out: %2d out1: %2d c: '%c'\n", i, s.out, s.out1,
319 State *
s = states_.template Push<State>();
322 s->codepoint = codepoint;
324 return stateCount_++;
328 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
329 *operandStack.template Push<Frag>() = Frag(s, s, s);
334 if (*atomCountStack.template Top<unsigned>())
335 *operatorStack.template Push<Operator>() = kConcatenation;
336 (*atomCountStack.template Top<unsigned>())++;
341 while (GetState(l1).out != kRegexInvalidState) l1 = GetState(l1).out;
342 GetState(l1).out = l2;
348 next = GetState(l).out;
358 Frag e2 = *operandStack.template Pop<Frag>(1);
359 Frag e1 = *operandStack.template Pop<Frag>(1);
360 Patch(e1.out, e2.start);
361 *operandStack.template Push<Frag>() =
362 Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
367 if (operandStack.
GetSize() >=
sizeof(Frag) * 2) {
368 Frag e2 = *operandStack.template Pop<Frag>(1);
369 Frag e1 = *operandStack.template Pop<Frag>(1);
370 SizeType s = NewState(e1.start, e2.start, 0);
371 *operandStack.template Push<Frag>() =
372 Frag(s,
Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
378 if (operandStack.
GetSize() >=
sizeof(Frag)) {
379 Frag e = *operandStack.template Pop<Frag>(1);
380 SizeType s = NewState(kRegexInvalidState, e.start, 0);
381 *operandStack.template Push<Frag>() =
382 Frag(s,
Append(e.out, s), e.minIndex);
388 if (operandStack.
GetSize() >=
sizeof(Frag)) {
389 Frag e = *operandStack.template Pop<Frag>(1);
390 SizeType s = NewState(kRegexInvalidState, e.start, 0);
392 *operandStack.template Push<Frag>() = Frag(s, s, e.minIndex);
398 if (operandStack.
GetSize() >=
sizeof(Frag)) {
399 Frag e = *operandStack.template Pop<Frag>(1);
400 SizeType s = NewState(kRegexInvalidState, e.start, 0);
402 *operandStack.template Push<Frag>() = Frag(e.start, s, e.minIndex);
420 else if (m == kInfinityQuantifier)
421 Eval(operandStack, kZeroOrMore);
423 Eval(operandStack, kZeroOrOne);
424 for (
unsigned i = 0; i < m - 1; i++)
425 CloneTopOperand(operandStack);
426 for (
unsigned i = 0; i < m - 1; i++)
427 Eval(operandStack, kConcatenation);
432 for (
unsigned i = 0; i < n - 1; i++)
433 CloneTopOperand(operandStack);
435 if (m == kInfinityQuantifier)
436 Eval(operandStack, kOneOrMore);
438 CloneTopOperand(operandStack);
439 Eval(operandStack, kZeroOrOne);
440 for (
unsigned i = n; i < m - 1; i++)
441 CloneTopOperand(operandStack);
442 for (
unsigned i = n; i < m; i++)
443 Eval(operandStack, kConcatenation);
446 for (
unsigned i = 0; i < n - 1; i++)
458 .template Top<Frag>();
460 stateCount_ - src.minIndex;
462 State *
s = states_.template Push<State>(count);
463 memcpy(s, &GetState(src.minIndex), count *
sizeof(State));
464 for (
SizeType j = 0; j < count; j++) {
465 if (s[j].out != kRegexInvalidState) s[j].out += count;
466 if (s[j].out1 != kRegexInvalidState) s[j].out1 += count;
468 *operandStack.template Push<Frag>() =
469 Frag(src.start + count, src.out + count, src.minIndex + count);
470 stateCount_ += count;
473 template <
typename InputStream>
476 if (ds.
Peek() <
'0' || ds.
Peek() >
'9')
return false;
477 while (ds.
Peek() >=
'0' && ds.
Peek() <=
'9') {
478 if (r >= 429496729 && ds.
Peek() >
'5')
480 r = r * 10 + (ds.
Take() -
'0');
486 template <
typename InputStream>
494 while ((codepoint = ds.
Take()) != 0) {
497 if (codepoint ==
'^') {
505 if (start == kRegexInvalidRange)
510 GetRange(current).next = r;
512 if (negate) GetRange(start).start |= kRangeNegationFlag;
517 if (ds.
Peek() ==
'b') {
520 }
else if (!CharacterEscape(ds, &codepoint))
523 RAPIDJSON_DELIBERATE_FALLTHROUGH;
528 if (codepoint ==
'-') {
533 RAPIDJSON_DELIBERATE_FALLTHROUGH;
537 if (current != kRegexInvalidRange) GetRange(current).next = r;
538 if (start == kRegexInvalidRange) start = r;
546 GetRange(current).end = codepoint;
555 Range *r = ranges_.template Push<Range>();
556 r->start = r->end = codepoint;
558 return rangeCount_++;
561 template <
typename InputStream>
563 unsigned *escapedCodepoint) {
565 switch (codepoint = ds.
Take()) {
580 *escapedCodepoint = codepoint;
583 *escapedCodepoint = 0x000C;
586 *escapedCodepoint = 0x000A;
589 *escapedCodepoint = 0x000D;
592 *escapedCodepoint = 0x0009;
595 *escapedCodepoint = 0x000B;
610 static const unsigned kInfinityQuantifier = ~0u;
617 template <
typename RegexType,
typename Allocator = CrtAllocator>
621 typedef typename Encoding::Ch
Ch;
625 allocator_(allocator),
627 state0_(allocator, 0),
628 state1_(allocator, 0),
631 if (!allocator_) ownAllocator_ = allocator_ =
RAPIDJSON_NEW(Allocator)();
632 stateSet_ =
static_cast<unsigned *
>(allocator_->Malloc(GetStateSetSize()));
633 state0_.template Reserve<SizeType>(regex_.stateCount_);
634 state1_.template Reserve<SizeType>(regex_.stateCount_);
638 Allocator::Free(stateSet_);
642 template <
typename InputStream>
644 return SearchWithAnchoring(is,
true,
true);
652 template <
typename InputStream>
654 return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
663 typedef typename RegexType::State
State;
664 typedef typename RegexType::Range
Range;
666 template <
typename InputStream>
672 const size_t stateSetSize = GetStateSetSize();
673 std::memset(stateSet_, 0, stateSetSize);
675 bool matched = AddState(*current, regex_.root_);
677 while (!current->
Empty() && (codepoint = ds.
Take()) != 0) {
678 std::memset(stateSet_, 0, stateSetSize);
681 for (
const SizeType *
s = current->template Bottom<SizeType>();
682 s != current->template End<SizeType>(); ++
s) {
683 const State &sr = regex_.GetState(*
s);
684 if (sr.codepoint == codepoint ||
685 sr.codepoint == RegexType::kAnyCharacterClass ||
686 (sr.codepoint == RegexType::kRangeCharacterClass &&
687 MatchRange(sr.rangeStart, codepoint))) {
688 matched = AddState(*next, sr.out) || matched;
689 if (!anchorEnd && matched)
return true;
691 if (!anchorBegin) AddState(*next, regex_.root_);
705 const State &
s = regex_.GetState(index);
706 if (s.out1 != kRegexInvalidState) {
707 bool matched = AddState(l, s.out);
708 return AddState(l, s.out1) || matched;
709 }
else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) {
710 stateSet_[index >> 5] |= (1u << (index & 31));
711 *l.template PushUnsafe<SizeType>() = index;
719 bool yes = (regex_.GetRange(rangeIndex).start &
720 RegexType::kRangeNegationFlag) == 0;
721 while (rangeIndex != kRegexInvalidRange) {
722 const Range &r = regex_.GetRange(rangeIndex);
723 if (codepoint >= (r.start & ~RegexType::kRangeNegationFlag) &&
749 #if defined(__clang__) || defined(_MSC_VER) 753 #endif // RAPIDJSON_INTERNAL_REGEX_H_
Allocator * ownAllocator_
bool Eval(Stack< Allocator > &operandStack, Operator op)
SizeType NewRange(unsigned codepoint)
const Range & GetRange(SizeType index) const
const CharType(& source)[N]
const State & GetState(SizeType index) const
RAPIDJSON_NAMESPACE_BEGIN typedef unsigned SizeType
Size type (for string lengths, array sizes, etc.)
SizeType out
link-list of all output states
#define RAPIDJSON_ASSERT(x)
Assertion.
#define RAPIDJSON_NAMESPACE_END
provide custom rapidjson namespace (closing expression)
GenericRegexSearch< Regex > RegexSearch
Stack< Allocator > state0_
Stack< Allocator > ranges_
SizeType out1
Equals to non-kInvalid for split.
A type-unsafe stack for storing different types of data.
Regular expression engine with subset of ECMAscript grammar.
bool AddState(Stack< Allocator > &l, SizeType index)
GenericRegex< UTF8<> > Regex
#define RAPIDJSON_NAMESPACE_BEGIN
provide custom rapidjson namespace (opening expression)
void CloneTopOperand(Stack< Allocator > &operandStack)
SizeType out
Equals to kInvalid for matching state.
#define RAPIDJSON_NEW(TypeName)
! customization point for global new
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint)
void PushOperand(Stack< Allocator > &operandStack, unsigned codepoint)
bool SearchWithAnchoring(InputStream &is, bool anchorBegin, bool anchorEnd)
DecodedStream(SourceStream &ss)
GenericRegexSearch(const RegexType ®ex, Allocator *allocator=0)
size_t GetStateSetSize() const
static const SizeType kRegexInvalidState
Represents an invalid index in GenericRegex::State::out, out1.
Stack< Allocator > state1_
void Swap(T &a, T &b) RAPIDJSON_NOEXCEPT
Custom swap() to avoid dependency on C++ <algorithm> header.
#define RAPIDJSON_DELETE(x)
! customization point for global delete
void Parse(DecodedStream< InputStream, Encoding > &ds)
const GenericPointer< typename T::ValueType > T2 T::AllocatorType & a
bool ParseRange(DecodedStream< InputStream, Encoding > &ds, SizeType *range)
bool Search(InputStream &is)
static SizeType Min(SizeType a, SizeType b)
State & GetState(SizeType index)
Frag(SizeType s, SizeType o, SizeType m)
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const
bool EvalQuantifier(Stack< Allocator > &operandStack, unsigned n, unsigned m)
bool Match(InputStream &is)
void Patch(SizeType l, SizeType s)
bool ParseUnsigned(DecodedStream< InputStream, Encoding > &ds, unsigned *u)
RegexType::EncodingType Encoding
Allocator * ownAllocator_
SizeType Append(SizeType l1, SizeType l2)
Range & GetRange(SizeType index)
bool CharacterEscape(DecodedStream< InputStream, Encoding > &ds, unsigned *escapedCodepoint)
void ImplicitConcatenation(Stack< Allocator > &atomCountStack, Stack< Allocator > &operatorStack)
static const SizeType kRegexInvalidRange
Stack< Allocator > states_
GenericRegex(const Ch *source, Allocator *allocator=0)